チュートリアル 2: 正則化手法パート2

第2週, 第1日目: 正則化

Neuromatch Academyによる

コンテンツ作成者: ラヴィ・テジャ・コンキマラ、モヒトラジュ・リンガン・クマライアン、ケビン・マチャド・ガンボア、ケルソン・シリング-スクリボ、ライル・ウンガー

コンテンツレビュアー: ピユシュ・チャウハン、バイ・スイウェイ、ケルソン・シリング-スクリボ

コンテンツ編集者: ロベルト・グイドッティ、スピロス・チャブリス

制作編集者: サイード・サレヒ、ガガナ・B、スピロス・チャブリス

チュートリアルの目的

過剰パラメータ化モデルの縮小としての正則化：L1およびL2
ドロップアウトによる正則化
データ拡張による正則化
ハイパーパラメータ調整の危険性
一般化の再考

# @title Tutorial slides
from IPython.display import IFrame
# OSF identifier of the slide deck; interpolated into both URLs below
link_id = "7um6p"
print(f"If you want to download the slides: https://osf.io/download/{link_id}/")
# Last expression of the cell, so the notebook displays the IFrame inline
IFrame(src=f"https://mfr.ca-1.osf.io/render?url=https://osf.io/{link_id}/?direct%26mode=render%26action=download%26mode=render", width=854, height=480)

セットアップ

本日のコードの一部は実行に最大1時間かかることがあります。そのため、そのコードは「非表示」にし、結果の出力のみを示しています。

# @title Install dependencies

# @markdown **WARNING**: There may be *errors* and/or *warnings* reported during the installation. However, they should be ignored.

# @title Install and import feedback gadget


from vibecheck import DatatopsContentReviewContainer
def content_review(notebook_section: str):
    """
    Build the feedback widget for one notebook section and render it.

    Args:
      notebook_section: str
        Identifier of the section the feedback is attached to

    Returns:
      The value returned by `DatatopsContentReviewContainer(...).render()`
    """
    endpoint = {
        "url": "https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab",
        "name": "neuromatch_dl",
        "user_key": "f379rz8y",
    }
    # The first positional argument is the text prompt, intentionally empty
    container = DatatopsContentReviewContainer("", notebook_section, endpoint)
    return container.render()


# Prefix shared by every feedback identifier built in this tutorial
feedback_prefix = "W2D1_T2"

# Imports
import copy
import torch
import random
import pathlib

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchvision import transforms
from torchvision.datasets import ImageFolder

from tqdm.auto import tqdm
from IPython.display import HTML, display

# @title Figure Settings
import logging
# Disable the matplotlib font-manager logger
logging.getLogger('matplotlib.font_manager').disabled = True

import ipywidgets as widgets
# IPython magics (valid only inside a notebook/IPython session):
# inline figure rendering with the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Apply the matplotlib style sheet loaded from this URL
plt.style.use("https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle")

# @title Loading Animal Faces Data
import requests, os
from zipfile import ZipFile

print("Start downloading and unzipping `AnimalFaces` dataset...")
name = 'afhq'
fname = f"{name}.zip"
url = "https://osf.io/kgfvj/download"

# Skip everything when the extracted folder already exists. The archive is
# deleted after extraction, so testing for the zip would re-download on
# every run. Later cells read the data from ./afhq.
if not os.path.isdir(name):
  if not os.path.exists(fname):
    r = requests.get(url, allow_redirects=True)
    # Fail loudly on an HTTP error instead of saving an error page as a zip
    r.raise_for_status()
    with open(fname, 'wb') as fh:
      fh.write(r.content)

  with ZipFile(fname, 'r') as zfile:
    zfile.extractall(".")
  # Remove the archive only after the ZipFile handle is closed; deleting a
  # file that is still open fails on Windows
  os.remove(fname)

print("Download completed.")

# @title Loading Animal Faces Randomized data

print("Start downloading and unzipping `Randomized AnimalFaces` dataset...")

names = ['afhq_random_32x32', 'afhq_10_32x32']
urls = ["https://osf.io/9sj7p/download",
        "https://osf.io/wvgkq/download"]


for name, url in zip(names, urls):
  fname = f"{name}.zip"

  # Skip datasets whose extracted folder already exists. The archive is
  # deleted after extraction, so testing for the zip would re-download on
  # every run. Later cells read from ./afhq_random_32x32 and ./afhq_10_32x32.
  if os.path.isdir(name):
    continue

  if not os.path.exists(fname):
    r = requests.get(url, allow_redirects=True)
    # Fail loudly on an HTTP error instead of saving an error page as a zip
    r.raise_for_status()
    with open(fname, 'wb') as fh:
      fh.write(r.content)

  with ZipFile(fname, 'r') as zfile:
    zfile.extractall(".")
  # Remove the archive only after the ZipFile handle is closed; deleting a
  # file that is still open fails on Windows
  os.remove(fname)

print("Download completed.")

# @title Plotting functions

def imshow(img):
  """
  Show one normalized image with matplotlib

  Args:
    img: torch.tensor
      Image to visualize; `.numpy()` is called on it and its axes are
      reordered with (1, 2, 0) before plotting

  Returns:
    Nothing
  """
  # Undo the normalization: x / 2 + 0.5
  unnormalized = img / 2 + 0.5
  # Move the first axis to the end, as plt.imshow expects
  channels_last = np.transpose(unnormalized.numpy(), (1, 2, 0))
  plt.imshow(channels_last)
  plt.axis(False)
  plt.show()


def plot_weights(norm, labels, ws, title='Weight Size Measurement'):
  """
  Bar chart of per-layer norm values with the total model norm overlaid

  Args:
    norm: float
      Total norm of the model, drawn as a horizontal dashed line
    labels: list
      Bar labels placed on the x-axis (one per layer)
    ws: list
      Bar heights (one norm value per layer)
    title: string
      Title of plot

  Returns:
    Nothing
  """
  # Style of the horizontal reference line for the whole-model norm
  reference_line = dict(linewidth=1,
                        color='r',
                        ls='--',
                        label='Total Model F-Norm')

  plt.figure(figsize=[8, 6])
  plt.title(title)
  plt.ylabel('Frobenius Norm Value')
  plt.xlabel('Model Layers')
  plt.bar(labels, ws)
  plt.axhline(y=norm, **reference_line)
  plt.legend()
  plt.show()


def visualize_data(dataloader, index=22):
  """
  Helper function to visualize one datapoint from every batch

  Args:
    dataloader: iterable
      Yields (data, label) batches; `data[index]` is permuted with (1, 2, 0)
      before plotting
    index: int
      Position inside each batch of the datapoint to show [default: 22].
      Every batch must hold more than `index` samples.

  Returns:
    Nothing
  """
  for idx, (data, label) in enumerate(dataloader):
    plt.figure(idx)

    # Pick the datapoint, move the first axis to the end, then undo the
    # normalization: x * 0.5 + 0.5 per channel
    image = data[index].permute(1, 2, 0) * \
            torch.tensor([0.5, 0.5, 0.5]) + \
            torch.tensor([0.5, 0.5, 0.5])

    # Convert the torch tensor into numpy for matplotlib
    plt.imshow(image.numpy())
    plt.axis(False)
    # `classes` is the module-level tuple of class names
    image_class = classes[label[index].item()]
    print(f'The image belongs to : {image_class}')

  plt.show()

# @title Helper functions

class AnimalNet(nn.Module):
  """
  Fully connected classifier for Animal Faces:
  fc1: nn.Linear(3 * 32 * 32, 128) followed by ReLU
  fc2: nn.Linear(128, 32) followed by ReLU
  fc3: nn.Linear(32, 3) followed by log-softmax
  """

  def __init__(self):
    """
    Create the three linear layers of AnimalNet

    Args:
      None

    Returns:
      Nothing
    """
    super().__init__()
    n_inputs = 3 * 32 * 32
    self.fc1 = nn.Linear(n_inputs, 128)
    self.fc2 = nn.Linear(128, 32)
    self.fc3 = nn.Linear(32, 3)

  def forward(self, x):
    """
    Forward pass of AnimalNet

    Args:
      x: torch.tensor
        Input batch; everything after the first dimension is flattened

    Returns:
      output: torch.tensor
        Log-softmax over dim 1 of the last layer's outputs
    """
    flat = x.view(x.size(0), -1)
    hidden = F.relu(self.fc1(flat))
    hidden = F.relu(self.fc2(hidden))
    logits = self.fc3(hidden)
    return F.log_softmax(logits, dim=1)


class Net(nn.Module):
  """
  Fully connected network with one input and one output feature:
  fc1: nn.Linear(1, 300) followed by leaky ReLU
  fc2: nn.Linear(300, 500) followed by leaky ReLU
  fc3: nn.Linear(500, 1)
  """

  def __init__(self):
    """
    Create the three linear layers of Net

    Args:
      None

    Returns:
      Nothing
    """
    super().__init__()

    self.fc1 = nn.Linear(1, 300)
    self.fc2 = nn.Linear(300, 500)
    self.fc3 = nn.Linear(500, 1)

  def forward(self, x):
    """
    Forward pass of Net

    Args:
      x: torch.tensor
        Input features

    Returns:
      output: torch.tensor
        Output of the last linear layer (no activation applied)
    """
    hidden = F.leaky_relu(self.fc1(x))
    hidden = F.leaky_relu(self.fc2(hidden))
    return self.fc3(hidden)


class BigAnimalNet(nn.Module):
  """
  Fully connected classifier for Animal Faces:
  fc1: nn.Linear(3*32*32, 124) followed by leaky ReLU
  fc2: nn.Linear(124, 64) followed by leaky ReLU
  fc3: nn.Linear(64, 3) followed by log-softmax
  """

  def __init__(self):
    """
    Create the three linear layers of BigAnimalNet

    Args:
      None

    Returns:
      Nothing
    """
    super().__init__()
    n_inputs = 3*32*32
    self.fc1 = nn.Linear(n_inputs, 124)
    self.fc2 = nn.Linear(124, 64)
    self.fc3 = nn.Linear(64, 3)

  def forward(self, x):
    """
    Forward pass of BigAnimalNet

    Args:
      x: torch.tensor
        Input batch; everything after the first dimension is flattened

    Returns:
      output: torch.tensor
        Log-softmax over dim 1 of the last layer's outputs
    """
    flat = x.view(x.size(0), -1)
    hidden = F.leaky_relu(self.fc1(flat))
    hidden = F.leaky_relu(self.fc2(hidden))
    logits = self.fc3(hidden)
    return F.log_softmax(logits, dim=1)


def train(args, model, train_loader, optimizer, epoch,
          reg_function1=None, reg_function2=None, criterion=F.nll_loss):
  """
  Run one pass over train_loader, updating the model after every batch

  Args:
    args: dictionary
      Must hold 'device'; 'lambda1' is read only when reg_function1 is
      given and 'lambda2' only when reg_function2 is given
    model: nn.module
      Neural network instance
    train_loader: torch.loader
      Iterable yielding (data, target) batches
    optimizer: function
      Optimizer
    epoch: int
      Current epoch index (not used inside this function)
    reg_function1: function
      Called with the model; its value is scaled by args['lambda1'] and
      added to the loss [default: None]
    reg_function2: function
      Called with the model; its value is scaled by args['lambda2'] and
      added to the loss [default: None]
    criterion: function
      Specifies loss function [default: nll_loss]

  Returns:
    model: nn.module
      The same model instance, after the parameter updates
  """
  device = args['device']
  use_reg1 = reg_function1 is not None
  use_reg2 = reg_function2 is not None

  model.train()
  for inputs, labels in train_loader:
    inputs, labels = inputs.to(device), labels.to(device)
    optimizer.zero_grad()

    # Data term first, then each enabled penalty on top of it
    loss = criterion(model(inputs), labels)
    if use_reg1:
      loss = loss + args['lambda1']*reg_function1(model)
    if use_reg2:
      loss = loss + args['lambda2']*reg_function2(model)

    loss.backward()
    optimizer.step()

  return model


def test(model, test_loader, loader='Test', criterion=F.nll_loss,
         device='cpu'):
  """
  Tests the current model

  Args:
    model: nn.module
      Neural network instance
    device: string
      GPU/CUDA if available, CPU otherwise
    test_loader: torch.loader
      Test dataset
    criterion: function
      Specifies loss function [default: nll_loss]

  Returns:
    test_loss: float
      Test loss
  """
  model.eval()
  test_loss = 0
  correct = 0
  with torch.no_grad():
    for data, target in test_loader:
      data, target = data.to(device), target.to(device)
      output = model(data)
      test_loss += criterion(output, target, reduction='sum').item()  # sum up batch loss
      pred = output.argmax(dim=1, keepdim=True)  # Get the index of the max log-probability
      correct += pred.eq(target.view_as(pred)).sum().item()

  test_loss /= len(test_loader.dataset)
  return 100. * correct / len(test_loader.dataset)


def main(args, model, train_loader, val_loader, test_data,
         reg_function1=None, reg_function2=None, criterion=F.nll_loss):
  """
  Trains the model with train_loader and after every epoch
  measures accuracy on train_loader and val_loader

  Args:
    args: dictionary
      Must hold 'device', 'lr', 'momentum' and 'epochs', plus 'lambda1' /
      'lambda2' when the matching regularisation function is given
    model: nn.module
      Neural network instance
    train_loader: torch.loader
      Train dataset
    val_loader: torch.loader
      Validation set
    test_data: dataset
      Accepted for interface compatibility; not used inside this function
    reg_function1: function
      Regularisation function [default: None]
    reg_function2: function
      Regularisation function [default: None]
    criterion: function
      Training loss function, forwarded to `train` [default: nll_loss]

  Returns:
    val_acc_list: list
      Log of validation accuracy
    train_acc_list: list
      Log of training accuracy
    param_norm_list: list
      Log of frobenius norm
    trained_model: nn.module
      Trained model/model post training
  """
  device = args['device']

  model = model.to(device)
  optimizer = optim.SGD(model.parameters(), lr=args['lr'], momentum=args['momentum'])

  val_acc_list, train_acc_list, param_norm_list = [], [], []
  for epoch in tqdm(range(args['epochs'])):
    # Forward `criterion` so a caller-supplied loss is actually used for
    # training; previously the argument was accepted but ignored.
    trained_model = train(args, model, train_loader, optimizer, epoch,
                          reg_function1=reg_function1,
                          reg_function2=reg_function2,
                          criterion=criterion)
    # `test` returns accuracy, which does not depend on the loss function,
    # so it keeps its own default criterion
    train_acc = test(trained_model, train_loader, loader='Train', device=device)
    val_acc = test(trained_model, val_loader, loader='Val', device=device)
    param_norm = calculate_frobenius_norm(trained_model)
    train_acc_list.append(train_acc)
    val_acc_list.append(val_acc)
    param_norm_list.append(param_norm)

  return val_acc_list, train_acc_list, param_norm_list, model


def calculate_frobenius_norm(model):
    """
    Frobenius norm taken over every parameter of the model

    Args:
      model: nn.module
        Neural network instance

    Returns:
      norm: torch.tensor
        Square root of the summed squared norms of all parameter tensors
        (the float 0.0 when the model has no parameters)
    """
    squared_total = 0.0
    for _, weights in model.named_parameters():
        # Detached so the accumulated value carries no autograd history
        squared_total = squared_total + torch.norm(weights).detach()**2
    return squared_total**0.5


def early_stopping_main(args, model, train_loader, val_loader, test_data):
  """
  Function to simulate early stopping

  Args:
    args: dictionary
      Must hold 'device', 'lr', 'momentum' and 'epochs'
    model: nn.module
      Neural network instance
    train_loader: torch.loader
      Train dataset
    val_loader: torch.loader
      Validation set
    test_data: dataset
      Accepted for interface compatibility; not used inside this function

  Returns:
    val_acc_list: list
      Val accuracy log; the epoch that triggers the stop is not logged
    train_acc_list: list
      Training accuracy log; the epoch that triggers the stop is not logged
    best_model: nn.module
      Copy of the model at the epoch with the best validation accuracy
    best_epoch: int
      Epoch at which the best validation accuracy was reached
  """
  device = args['device']

  model = model.to(device)
  optimizer = optim.SGD(model.parameters(), lr=args['lr'], momentum=args['momentum'])

  best_acc = 0.0
  best_epoch = 0
  # Fallback so the return value is defined even when no epoch ever beats
  # the initial best_acc (e.g. zero epochs or 0% validation accuracy)
  best_model = copy.deepcopy(model)

  # Number of successive epochs that you want to wait before stopping training process
  patience = 20

  # Keeps track of number of epochs during which the val_acc was less than
  # or equal to best_acc
  wait = 0

  val_acc_list, train_acc_list = [], []
  for epoch in tqdm(range(args['epochs'])):
    # `train` has no separate device parameter (it reads args['device']);
    # passing one positionally shifted every following argument.
    trained_model = train(args, model, train_loader, optimizer, epoch)
    train_acc = test(trained_model, train_loader, loader='Train', device=device)
    val_acc = test(trained_model, val_loader, loader='Val', device=device)
    if (val_acc > best_acc):
      best_acc = val_acc
      best_epoch = epoch
      best_model = copy.deepcopy(trained_model)
      wait = 0
    else:
      wait += 1
    if (wait > patience):
      print(f'Early stopped on epoch: {epoch}')
      break
    train_acc_list.append(train_acc)
    val_acc_list.append(val_acc)

  return val_acc_list, train_acc_list, best_model, best_epoch

# @title Set random seed
# @markdown Executing `set_seed(seed=seed)` you are setting the seed

# For DL its critical to set the random seed so that students can have a
# baseline to compare their results to expected results.
# Read more here: https://pytorch.org/docs/stable/notes/randomness.html

# Call `set_seed` function in the exercises to ensure reproducibility.
import random
import torch

def set_seed(seed=None, seed_torch=True):
  """
  Function that controls randomness. NumPy and random modules must be imported.

  Args:
    seed : Integer
      A non-negative integer that defines the random state. When `None`,
      a seed in [0, 2**32) is drawn from NumPy. Default is `None`.
    seed_torch : Boolean
      If `True` sets the random seed for pytorch tensors, so pytorch module
      must be imported. Default is `True`.

  Returns:
    Nothing.
  """
  if seed is None:
    # Request a 64-bit dtype explicitly: with the platform default integer
    # (32-bit on some platforms) an upper bound of 2**32 is out of range.
    seed = int(np.random.randint(0, 2 ** 32, dtype=np.uint64))
  random.seed(seed)
  np.random.seed(seed)
  if seed_torch:
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    # Ask cuDNN for deterministic behaviour and disable its benchmark mode
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

  print(f'Random seed {seed} has been set.')


# In case that `DataLoader` is used
def seed_worker(worker_id):
  """
  Seed NumPy and `random` from the current torch seed. Passed to
  DataLoader as `worker_init_fn`.

  Args:
    worker_id: integer
      ID of the worker (not used inside this function)
      Refer: https://pytorch.org/docs/stable/data.html#data-loading-randomness for more details

  Returns:
    Nothing
  """
  # Reduce the torch seed modulo 2**32 before handing it to NumPy/random
  derived_seed = torch.initial_seed() % 2**32
  for seed_fn in (np.random.seed, random.seed):
    seed_fn(derived_seed)

# @title Set device (GPU or CPU). Execute `set_device()`
# especially if torch modules used.

# Inform the user if the notebook uses GPU or CPU.

def set_device():
  """
  Pick the device and tell the user which one is in use

  Args:
    None

  Returns:
    device: string
      "cuda" if CUDA is available, "cpu" otherwise
  """
  if torch.cuda.is_available():
    device = "cuda"
    print("GPU is enabled in this notebook.")
  else:
    device = "cpu"
    print("WARNING: For this notebook to perform best, "
        "if possible, in the menu under `Runtime` -> "
        "`Change runtime type.`  select `GPU` ")

  return device

# Fix the seed and pick the device once; later cells reuse SEED and DEVICE
SEED = 2021
set_seed(seed=SEED)
DEVICE = set_device()

# @title Dataloaders for the Dataset
## Dataloaders for the Dataset
batch_size = 128
# Class names, indexed by label in `visualize_data`
classes = ('cat', 'dog', 'wild')

# Convert to tensor, then normalize each of the three channels with
# mean 0.5 and std 0.5
train_transform = transforms.Compose([
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
     ])
data_path = pathlib.Path('.')/'afhq' # Using pathlib to be compatible with all OS's
img_dataset = ImageFolder(data_path/'train', transform=train_transform)


####################################################
# Generator seeded with SEED and shared by the DataLoaders below
g_seed = torch.Generator()
g_seed.manual_seed(SEED)


## Dataloaders for the Original Dataset
# Split into 100 train / 100 validation samples; the remaining 14430 are
# discarded. No `generator` is passed to `random_split`, so the split does
# not use `g_seed`.
img_train_data, img_val_data,_ = torch.utils.data.random_split(img_dataset,
                                                               [100, 100, 14430])

# Creating train_loader and Val_loader
train_loader = torch.utils.data.DataLoader(img_train_data,
                                           batch_size=batch_size,
                                           worker_init_fn=seed_worker,
                                           num_workers=2,
                                           generator=g_seed)
val_loader = torch.utils.data.DataLoader(img_val_data,
                                         batch_size=1000,
                                         num_workers=2,
                                         worker_init_fn=seed_worker,
                                         generator=g_seed)

# Creating test dataset (same transform steps as train_transform)
test_transform = transforms.Compose([
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
     ])
img_test_dataset = ImageFolder(data_path/'val', transform=test_transform)


####################################################

## Dataloaders for the  Random Dataset

# Splitting randomized data into training and validation data
# NOTE: `data_path` and `img_dataset` are rebound here and now refer to the
# randomized dataset
data_path = pathlib.Path('.')/'afhq_random_32x32/afhq_random' # using pathlib to be compatible with all OS's
img_dataset = ImageFolder(data_path/'train', transform=train_transform)
# 100 train / 100 validation samples; the remaining 14430 are discarded
random_img_train_data, random_img_val_data,_ = torch.utils.data.random_split(img_dataset, [100,100,14430])

# Randomized train and validation dataloader
rand_train_loader = torch.utils.data.DataLoader(random_img_train_data,
                                                batch_size=batch_size,
                                                num_workers=2,
                                                worker_init_fn=seed_worker,
                                                generator=g_seed)
rand_val_loader = torch.utils.data.DataLoader(random_img_val_data,
                                              batch_size=1000,
                                              num_workers=2,
                                              worker_init_fn=seed_worker,
                                              generator=g_seed)

####################################################

## Dataloaders for the Partially Random Dataset

# Splitting data between training and validation dataset for partially randomized data
# NOTE: `data_path` and `img_dataset` are rebound here and now refer to the
# partially randomized dataset
data_path = pathlib.Path('.')/'afhq_10_32x32/afhq_10' # using pathlib to be compatible with all OS's
img_dataset = ImageFolder(data_path/'train', transform=train_transform)
# 100 train / 100 validation samples; the remaining 14430 are discarded
partially_random_train_data, partially_random_val_data, _ = torch.utils.data.random_split(img_dataset, [100,100,14430])

# Training and Validation loader for partially randomized data
partial_rand_train_loader = torch.utils.data.DataLoader(partially_random_train_data,
                                                        batch_size=batch_size,
                                                        num_workers=2,
                                                        worker_init_fn=seed_worker,
                                                        generator=g_seed)
partial_rand_val_loader = torch.utils.data.DataLoader(partially_random_val_data,
                                                      batch_size=1000,
                                                      num_workers=2,
                                                      worker_init_fn=seed_worker,
                                                      generator=g_seed)

セクション1: L1およびL2正則化

所要時間の目安: 約30分

# @title Video 1: L1 and L2 regularization
from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display


class PlayVideo(IFrame):
  """
  IFrame that embeds a video hosted on Bilibili or OSF

  Args:
    id: string
      Video identifier, interpolated into the player URL
    source: string
      Either 'Bilibili' or 'Osf'
    page: int
      Page number used in the Bilibili player URL [default: 1]
    width: int
      Frame width [default: 400]
    height: int
      Frame height [default: 300]
    **kwargs:
      Forwarded unchanged to IFrame

  Raises:
    ValueError: if `source` is neither 'Bilibili' nor 'Osf'
  """
  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Without this branch `src` stayed unbound and the call below failed
      # with an UnboundLocalError that did not name the bad source
      raise ValueError(f"Unsupported video source: {source!r}")
    super(PlayVideo, self).__init__(src, width, height, **kwargs)


def display_videos(video_ids, W=400, H=300, fs=1):
  """
  Create one output widget per video, each holding the embedded player

  Args:
    video_ids: list
      (source, video id) entries; source 'Youtube' uses YouTubeVideo,
      any other source is handed to PlayVideo
    W: int
      Player width [default: 400]
    H: int
      Player height [default: 300]
    fs: int
      Forwarded to the player as `fs` [default: 1]

  Returns:
    tab_contents: list
      One widgets.Output per entry of video_ids, in the same order
  """
  tab_contents = []
  for entry in video_ids:
    source, video_key = entry[0], entry[1]
    out = widgets.Output()
    # Everything printed or displayed inside this context lands in `out`
    with out:
      if source == 'Youtube':
        video = YouTubeVideo(id=video_key, width=W,
                             height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=video_key, source=source, width=W,
                          height=H, fs=fs, autoplay=False)
        if source == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif source == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    tab_contents.append(out)
  return tab_contents


# (source, video id) pairs; one tab is created per entry
video_ids = [('Youtube', 'oQNdloKdysM'), ('Bilibili', 'BV19h41167H7')]
tab_contents = display_videos(video_ids, W=854, H=480)
tabs = widgets.Tab()
tabs.children = tab_contents
# Title each tab with the source name of its video
for i in range(len(tab_contents)):
  tabs.set_title(i, video_ids[i][0])
display(tabs)

# @title Submit your feedback
# Render the feedback widget for this video
content_review(f"{feedback_prefix}_L1_and_L2_regularization_Video")

すでに他のコースでL1およびL2正則化に触れたことがある方もいるかもしれません。L1とL2は最も一般的な正則化の種類です。これらは一般的なコスト関数に正則化項と呼ばれる別の項を追加して更新します。

$$\text{コスト関数} = \text{損失（例：バイナリクロスエントロピー）} + \text{正則化項}$$

この正則化項はパラメータを小さくし、より単純なモデルを作ることで過学習を減らします。

チームメイトと上記の仮定が良いか悪いかについて話し合ってみてください。

セクション1.1: 正則化なしモデル

# @markdown #### Dataloaders for Regularization
# NOTE: `data_path`, `img_dataset` and `g_seed` are rebound in this cell
data_path = pathlib.Path('.')/'afhq' # Using pathlib to be compatible with all OS's
img_dataset = ImageFolder(data_path/'train', transform=train_transform)

# Splitting dataset: 30 train / 100 validation samples; the remaining 14500
# are discarded. No `generator` is passed to `random_split`, so the split
# does not use `g_seed` (which is created only afterwards).
reg_train_data, reg_val_data,_ = torch.utils.data.random_split(img_dataset,
                                                               [30, 100, 14500])
# Fresh generator seeded with SEED for the two DataLoaders below
g_seed = torch.Generator()
g_seed.manual_seed(SEED)

# Creating train_loader and Val_loader
reg_train_loader = torch.utils.data.DataLoader(reg_train_data,
                                               batch_size=batch_size,
                                               worker_init_fn=seed_worker,
                                               num_workers=2,
                                               generator=g_seed)
reg_val_loader = torch.utils.data.DataLoader(reg_val_data,
                                             batch_size=1000,
                                             worker_init_fn=seed_worker,
                                             num_workers=2,
                                             generator=g_seed)

まずは正則化なしでモデルを訓練し、このセクションのベンチマークとして取っておきましょう。

# Set the arguments
# No 'lambda1'/'lambda2' entries: no regularization function is passed below
args = {
    'epochs': 150,
    'lr': 5e-3,
    'momentum': 0.99,
    'device': DEVICE,
}

# Initialize the model (re-seeded first so the initial weights are reproducible)
set_seed(seed=SEED)
model = AnimalNet()

# Train the model; the returned model (fourth value) is discarded
val_acc_unreg, train_acc_unreg, param_norm_unreg, _ = main(args,
                                                           model,
                                                           reg_train_loader,
                                                           reg_val_loader,
                                                           img_test_dataset)

# Train and validation accuracy plot
plt.figure()
plt.plot(val_acc_unreg, label='Val Accuracy', c='red', ls='dashed')
plt.plot(train_acc_unreg, label='Train Accuracy', c='red', ls='solid')
# Horizontal line at the best validation accuracy
plt.axhline(y=max(val_acc_unreg), c='green', ls='dashed')
plt.title('Unregularized Model')
plt.ylabel('Accuracy (%)')
plt.xlabel('Epoch')
plt.legend()
plt.show()
print(f"Maximum Validation Accuracy reached: {max(val_acc_unreg)}")

セクション1.2: L1正則化

L1正則化（またはLASSO $^{\ddagger}$ ）は、ディープラーニングのすべての重みの絶対値の和をペナルティとして用い、以下の損失関数を得ます（ $L$ は通常のクロスエントロピー損失）：

$$L_R = L + \lambda \sum \left| w^{(r)}_{ij} \right|$$

ここで $r$ は層を、 $ij$ はその層内の特定の重みを表します。

大まかに言うと、L1正則化はL2正則化に似ており、重みを小さくします（次の節で類似点がわかります）。確率的勾配降下法を用いた場合の重み更新式は以下の通りです：

$$w^{(r)}_{ij} \leftarrow w^{(r)}_{ij} - \eta \cdot \lambda \cdot \text{sgn}\left(w^{(r)}_{ij}\right) - \eta \frac{\partial L}{\partial w^{(r)}_{ij}}$$

ここで $\text{sgn}(\cdot)$ は符号関数であり、

$$\text{sgn}(w) = \left\{ \begin{array}{ll} +1 & \mbox{if } w > 0 \\ -1 & \mbox{if } w < 0 \\ 0 & \mbox{if } w = 0 \end{array} \right.$$

$^{\ddagger}$ LASSO: 最小絶対収縮および選択演算子

コーディング演習 1.1: L1正則化

PyTorchモデルのすべてのテンソルのL1ノルムを計算する関数を書いてください。

def l1_reg(model):
  """
  Exercise skeleton: compute the L1 norm over all tensors in the model.
  Raises NotImplementedError until the student completes it.

  Args:
    model: nn.module
      Neural network instance

  Returns:
    l1: float
      L1 norm of all the tensors in the model
  """
  l1 = 0.0
  ####################################################################
  # Fill in all missing code below (...),
  # then remove or comment the line below to test your function
  raise NotImplementedError("Complete the l1_reg function")
  ####################################################################
  # Accumulate one contribution per parameter tensor
  for param in model.parameters():
    l1 += ...

  return l1


# Re-seed so the layer created in the test below is initialized reproducibly
set_seed(seed=SEED)
## uncomment to test
# net = nn.Linear(20, 20)
# print(f"L1 norm of the model: {l1_reg(net)}")

Random seed 2021 has been set.
L1 norm of the model: 48.445133209228516

解答を見る

# @title Submit your feedback
# Render the feedback widget for the L1 regularization exercise
content_review(f"{feedback_prefix}_L1_regularization_Exercise")

次に、L1正則化を用いた分類器を訓練します。検証精度が正則化なしモデルより高くなるようにハイパーパラメータlambda1を調整してください。

# Set the arguments
# 'lambda1' scales the L1 penalty inside `train`.
# NOTE(review): 'test_batch_size' is not read by main/train/test as they
# appear in this file — confirm whether it is still needed.
args1 = {
    'test_batch_size': 1000,
    'epochs': 150,
    'lr': 5e-3,
    'momentum': 0.99,
    'device': DEVICE,
    'lambda1': 0.001  # <<<<<<<< Tune the hyperparameter lambda1
}

# Initialize the model (re-seeded first so the initial weights are reproducible)
set_seed(seed=SEED)
model = AnimalNet()

# Train the model with the L1 penalty only; the returned model is discarded
val_acc_l1reg, train_acc_l1reg, param_norm_l1reg, _ = main(args1,
                                                           model,
                                                           reg_train_loader,
                                                           reg_val_loader,
                                                           img_test_dataset,
                                                           reg_function1=l1_reg)

# Train and validation accuracy plot
plt.figure()
plt.plot(val_acc_l1reg, label='Val Accuracy L1 Regularized',
         c='red', ls='dashed')
plt.plot(train_acc_l1reg, label='Train Accuracy L1 regularized',
         c='red', ls='solid')
# Horizontal line at the best validation accuracy
plt.axhline(y=max(val_acc_l1reg), c='green', ls='dashed')
plt.title('L1 regularized model')
plt.ylabel('Accuracy (%)')
plt.xlabel('Epoch')
plt.legend()
plt.show()
print(f"Maximum Validation Accuracy Reached: {max(val_acc_l1reg)}")

L1正則化で効果的だったlambda1の値はいくつでしたか？

注意: 式中の $\lambda$ はコード中のlambda1に対応しています。

# @title Submit your feedback
# Render the feedback widget for the lambda1 tuning exercise
content_review(f"{feedback_prefix}_Tune_lambda1_Exercise")

セクション1.3: L2 / リッジ正則化

L2正則化（またはリッジ）、別名「重み減衰」は広く使われています。これはクロスエントロピー損失関数 $L$ に二次のペナルティ項を加え、以下の新しい損失関数 $L_R$ を得ます：

$$L_R = L + \lambda \sum \left( w^{(r)}_{ij} \right)^2$$

ここでも $r$ は層、 $ij$ はその層内の特定の重みを示します。

L2正則化の理解を深めるために、勾配降下法に基づく重みとバイアスの更新式への影響を調べます。上記の式の両辺を微分すると、

$$\frac{\partial L_R}{\partial w^{(r)}_{ij}}=\frac{\partial L}{\partial w^{(r)}_{ij}} + 2\lambda w^{(r)}_{ij}$$

したがって、重みの更新則は以下のようになります：

$$w^{(r)}_{ij} \leftarrow w^{(r)}_{ij} - \eta \frac{\partial L}{\partial w^{(r)}_{ij}} - 2 \eta \lambda w^{(r)}_{ij} = (1 - 2 \eta \lambda) w^{(r)}_{ij} - \eta \frac{\partial L}{\partial w^{(r)}_{ij}}$$

ここで $\eta$ は学習率です。

コーディング演習 1.2: L2正則化

PyTorchモデルのすべてのテンソルのL2ノルムを計算する関数を書いてください。（以前は何と呼んでいましたか？）

def l2_reg(model):
  """
  Exercise skeleton: compute the L2 norm over all tensors in the model.
  Raises NotImplementedError until the student completes it.

  Args:
    model: nn.module
      Neural network instance

  Returns:
    l2: float
      L2 norm of all the tensors in the model
  """

  l2 = 0.0
  ####################################################################
  # Fill in all missing code below (...),
  # then remove or comment the line below to test your function
  raise NotImplementedError("Complete the l2_reg function")
  ####################################################################
  # Accumulate one contribution per parameter tensor
  for param in model.parameters():
    l2 += ...

  return l2


# Re-seed so the layer created in the test below is initialized reproducibly
set_seed(SEED)
## uncomment to test
# net = nn.Linear(20, 20)
# print(f"L2 norm of the model: {l2_reg(net)}")

Random seed 2021 has been set.
L2 norm of the model: 7.328375816345215

解答を見る

# @title Submit your feedback
# Render the feedback widget for the L2 / Ridge regularization exercise
content_review(f"{feedback_prefix}_L2_Ridge_Regularization_Exercise")

次に、L2正則化を用いた分類器を訓練します。検証精度が正則化なしモデルより高くなるようにハイパーパラメータlambda2を調整してください。

# Set the arguments
# 'lambda2' scales the L2 penalty inside `train`.
# NOTE(review): 'test_batch_size' is not read by main/train/test as they
# appear in this file — confirm whether it is still needed.
args2 = {
    'test_batch_size': 1000,
    'epochs': 150,
    'lr': 5e-3,
    'momentum': 0.99,
    'device': DEVICE,
    'lambda2': 0.001  # <<<<<<<< Tune the hyperparameter lambda2
}

# Initialize the model (re-seeded first so the initial weights are reproducible)
set_seed(seed=SEED)
model = AnimalNet()

# Train the model with the L2 penalty only; `model` is rebound to the result.
# NOTE(review): this run uses train_loader/val_loader, whereas the
# unregularized and L1 runs use reg_train_loader/reg_val_loader — confirm
# that comparing against those runs on different splits is intended.
val_acc_l2reg, train_acc_l2reg, param_norm_l2reg, model = main(args2,
                                                               model,
                                                               train_loader,
                                                               val_loader,
                                                               img_test_dataset,
                                                               reg_function2=l2_reg)

## Train and validation accuracy plot
plt.figure()
plt.plot(val_acc_l2reg, label='Val Accuracy L2 regularized',
         c='red', ls='dashed')
plt.plot(train_acc_l2reg, label='Train Accuracy L2 regularized',
         c='red', ls='solid')
# Horizontal line at the best validation accuracy
plt.axhline(y=max(val_acc_l2reg), c='green', ls='dashed')
plt.title('L2 Regularized Model')
plt.ylabel('Accuracy (%)')
plt.xlabel('Epoch')
plt.legend()
plt.show()
print(f"Maximum Validation Accuracy reached: {max(val_acc_l2reg)}")

L2正則化で効果的だったlambda2の値はいくつでしたか？

注意: 式中の $\lambda$ はコード中のlambda2に対応しています。

# @title Submit your feedback
# Render the feedback widget for the lambda2 tuning exercise
content_review(f"{feedback_prefix}_Tune_lambda2_Exercise")

次に、L1とL2の両方の正則化項を含むモデルを実行してみましょう。

# @markdown Visualize all of them together (Run Me!)

# @markdown `lambda1=0.001` and `lambda2=0.001`

# Both penalties enabled: 'lambda1' scales L1 and 'lambda2' scales L2
args3 = {
    'test_batch_size': 1000,
    'epochs': 150,
    'lr': 5e-3,
    'momentum': 0.99,
    'device': DEVICE,
    'lambda1': 0.001,
    'lambda2': 0.001
}

# Initialize the model (re-seeded first so the initial weights are reproducible)
set_seed(seed=SEED)
model = AnimalNet()
# NOTE(review): uses train_loader/val_loader, not the reg_* loaders used by
# the unregularized and L1 runs — confirm this is intended.
val_acc_l1l2reg, train_acc_l1l2reg, param_norm_l1l2reg, _ = main(args3,
                                                                 model,
                                                                 train_loader,
                                                                 val_loader,
                                                                 img_test_dataset,
                                                                 reg_function1=l1_reg,
                                                                 reg_function2=l2_reg)

# One colour per model: dashed curve = validation accuracy, solid curve =
# training accuracy, dashed horizontal line = best validation accuracy.
# Labels are raw f-strings so the LaTeX backslash in "\lambda" is not parsed
# as an (invalid) string escape; the resulting text is unchanged.
plt.figure()

plt.plot(val_acc_l2reg, c='red', ls='dashed')
plt.plot(train_acc_l2reg,
         label=rf"L2 regularized, $\lambda_2$={args2['lambda2']}",
         c='red', ls='solid')
plt.axhline(y=max(val_acc_l2reg), c='red', ls='dashed')

plt.plot(val_acc_l1reg, c='green', ls = 'dashed')
plt.plot(train_acc_l1reg,
         label=rf"L1 regularized, $\lambda_1$={args1['lambda1']}",
         c='green', ls='solid')
plt.axhline(y=max(val_acc_l1reg), c='green', ls='dashed')

plt.plot(val_acc_unreg, c='blue', ls = 'dashed')
plt.plot(train_acc_unreg,
         label='Unregularized', c='blue', ls='solid')
plt.axhline(y=max(val_acc_unreg), c='blue', ls='dashed')

plt.plot(val_acc_l1l2reg, c='orange', ls='dashed')
plt.plot(train_acc_l1l2reg,
         label=rf"L1+L2 regularized, $\lambda_1$={args3['lambda1']}, $\lambda_2$={args3['lambda2']}",
         c='orange', ls='solid')
plt.axhline(y=max(val_acc_l1l2reg), c='orange', ls = 'dashed')

plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.legend()
plt.show()

次に、これらの異なる正則化がモデルのパラメータにどのような影響を与えるかを可視化します。パラメータの大きさ（厳密にはフロベニウスノルム）を計算して効果を観察します。

# Print the first logged parameter norm of the unregularized run
x =  param_norm_unreg[0]
print(x)

# @markdown #### Visualize Norm of the Models (Train Me!)
# Each logged norm has `.cpu().numpy()` called on it before plotting
plt.figure()
plt.plot([i.cpu().numpy() for i in param_norm_unreg],
         label='Unregularized', c='blue')
plt.plot([i.cpu().numpy() for i in param_norm_l1reg],
         label='L1 Regularized', c='green')
plt.plot([i.cpu().numpy() for i in param_norm_l2reg],
         label='L2 Regularized', c='red')
plt.plot([i.cpu().numpy() for i in param_norm_l1l2reg],
         label='L1+L2 Regularized', c='orange')
plt.xlabel('Epoch')
plt.ylabel('Parameter Norms')
plt.legend()
plt.show()

上記のプロットでは、モデルが100%の訓練精度を達成した後も検証精度が変動しているのが見えたはずです。なぜこのようなことが起きるのでしょうか？

セクション2: ドロップアウト

所要時間の目安: 約25分

# @title Video 2: Dropout
from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display


class PlayVideo(IFrame):
  """
  IFrame that embeds a video hosted on Bilibili or OSF.

  Args:
    id: str
      Video identifier, inserted into the player URL
    source: str
      Hosting platform, either 'Bilibili' or 'Osf'
    page: int
      Page number inserted into the Bilibili player URL
    width: int
      Frame width forwarded to IFrame
    height: int
      Frame height forwarded to IFrame
    **kwargs:
      Extra keyword arguments forwarded to IFrame

  Raises:
    ValueError: if `source` is neither 'Bilibili' nor 'Osf'
  """

  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Without this branch `src` would stay unbound and the constructor would
      # fail with an opaque UnboundLocalError.
      raise ValueError(f"Unsupported video source: {source!r} "
                       "(expected 'Bilibili' or 'Osf')")
    super().__init__(src, width, height, **kwargs)


def display_videos(video_ids, W=400, H=300, fs=1):
  """
  Render every video inside its own output widget.

  Args:
    video_ids: list of tuples
      (source, id) pairs; 'Youtube' entries use YouTubeVideo, every other
      source is handed to PlayVideo
    W: int
      Player width
    H: int
      Player height
    fs: int
      Forwarded to the player as its `fs` argument

  Returns:
    tab_contents: list
      One widgets.Output per entry of video_ids, in the same order
  """
  tab_contents = []
  for entry in video_ids:
    out = widgets.Output()
    # Everything printed or displayed inside the context lands in `out`
    with out:
      platform = entry[0]
      if platform == 'Youtube':
        video = YouTubeVideo(id=entry[1], width=W,
                             height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=entry[1], source=platform, width=W,
                          height=H, fs=fs, autoplay=False)
        if platform == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif platform == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    tab_contents.append(out)
  return tab_contents


# Videos of this section as (platform, id) pairs
video_ids = [
    ('Youtube', 'UZfUzawej3A'),
    ('Bilibili', 'BV1gU4y1G7V2'),
]
tab_contents = display_videos(video_ids, W=854, H=480)

# One tab per video, titled with the name of the hosting platform
tabs = widgets.Tab()
tabs.children = tab_contents
for i, _entry in enumerate(video_ids):
  tabs.set_title(i, _entry[0])
display(tabs)

# @title Submit your feedback
content_review(f"{feedback_prefix}_Dropout_Video")

ドロップアウトでは、訓練中に文字通り一部のニューロンを「ドロップアウト」（ゼロにする）します。訓練中は通常、各層のノードの約50%をランダムにゼロにし、そのたびに異なるノードの組み合わせを選ぶことでノイズを導入し、過学習を減らします。

先ほど生成したおもちゃデータセットに戻り、ドロップアウトがノイズの多いデータセットでの訓練をどのように安定化させるかを可視化しましょう。先ほどのアーキテクチャを少し変更してドロップアウト層を追加します。

class NetDropout(nn.Module):
  """
  Three-layer fully connected network with dropout:
  fc1: nn.Linear(1, 300)   -> Dropout(p=0.4) -> leaky_relu
  fc2: nn.Linear(300, 500) -> Dropout(p=0.2) -> leaky_relu
  fc3: nn.Linear(500, 1)   (output layer, no activation)
  """

  def __init__(self):
    """
    Create the linear and dropout layers of NetDropout

    Args:
      None

    Returns:
      Nothing
    """
    super().__init__()

    # Linear layers
    self.fc1 = nn.Linear(1, 300)
    self.fc2 = nn.Linear(300, 500)
    self.fc3 = nn.Linear(500, 1)
    # Dropout applied to the outputs of fc1 and fc2 respectively
    self.dropout1 = nn.Dropout(p=0.4)
    self.dropout2 = nn.Dropout(p=0.2)

  def forward(self, x):
    """
    Forward pass of NetDropout

    Args:
      x: torch.tensor
        Input features

    Returns:
      output: torch.tensor
        Output/Predictions
    """
    # Dropout is applied to the linear output before the activation
    hidden1 = F.leaky_relu(self.dropout1(self.fc1(x)))
    hidden2 = F.leaky_relu(self.dropout2(self.fc2(hidden1)))
    return self.fc3(hidden2)

# @markdown #### Run to train the default network
set_seed(seed=SEED)

# Noisy training set: 10 random inputs, targets are 2*x plus scaled
# Gaussian noise
X = torch.rand((10, 1))
# NOTE(review): Tensor.sort is not in-place and its result is discarded here,
# so X keeps its original order.
X.sort(dim=0)
Y = 2*X + 2*torch.empty((X.shape[0], 1)).normal_(mean=0, std=1)

# Insert a singleton dimension: (10, 1) -> (10, 1, 1)
X = X.unsqueeze_(1)
Y = Y.unsqueeze_(1)

# Test grid: 40 evenly spaced inputs in [0, 1], shape (40, 1, 1)
X_test = torch.linspace(0, 1, 40).reshape((40, 1, 1))

# Model, loss and optimizer
model = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
max_epochs = 10000
iters = 0

# One column per stored snapshot: every 500th epoch plus the final epoch
running_predictions = np.empty((40, int(max_epochs/500 + 1)))

train_loss, test_loss, model_norm = [], [], []

for epoch in tqdm(range(max_epochs)):

  # Training step; the norm is recorded before the weights are updated
  model_norm.append(calculate_frobenius_norm(model))
  model.train()
  optimizer.zero_grad()
  predictions = model(X)
  loss = criterion(predictions, Y)
  loss.backward()
  optimizer.step()
  train_loss.append(loss.data)

  # Evaluation against the noise-free target 2*x
  model.eval()
  Y_test = model(X_test)
  loss = criterion(Y_test, 2*X_test)
  test_loss.append(loss.data)

  # Store the current predictions every 500 epochs and at the last epoch
  _store_snapshot = epoch % 500 == 0 or epoch == max_epochs - 1
  if _store_snapshot:
    running_predictions[:, iters] = Y_test[:, 0, 0].detach().numpy()
    iters += 1

# Train the network on toy dataset

# Initialize the model
set_seed(seed=SEED)
model = NetDropout()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
max_epochs = 10000
iters = 0

# One column per stored snapshot: every 500th epoch plus the final epoch.
# The "+ 1" column and the `max_epochs - 1` test below mirror the no-dropout
# run; a test against `max_epochs` itself can never succeed inside
# range(max_epochs), so the final epoch would never be stored.
running_predictions_dp = np.empty((40, (int)(max_epochs / 500 + 1)))

train_loss_dp = []
test_loss_dp = []
model_norm_dp = []

for epoch in tqdm(range(max_epochs)):

  # Training (train mode, dropout active); the norm is recorded before the
  # weights are updated
  model_norm_dp.append(calculate_frobenius_norm(model))
  model.train()
  optimizer.zero_grad()
  predictions = model(X)
  loss = criterion(predictions, Y)
  loss.backward()
  optimizer.step()

  train_loss_dp.append(loss.data)

  # Evaluation (eval mode, dropout disabled) against the noise-free target 2*x
  model.eval()
  Y_test = model(X_test)
  loss = criterion(Y_test, 2*X_test)
  test_loss_dp.append(loss.data)

  # Store the current predictions every 500 epochs and at the last epoch
  if (epoch % 500 == 0 or epoch == max_epochs - 1):
    running_predictions_dp[:, iters] = Y_test[:, 0, 0].detach().numpy()
    iters += 1

訓練が終わったので、訓練過程でモデルがどのように変化したかを見てみましょう。

# @markdown Animation! (Run Me!)
set_seed(seed=SEED)

fig = plt.figure(figsize=(8, 6))
ax = plt.axes()


def frame(i):
  """
  Draw the i-th stored snapshot of the dropout model's predictions

  Args:
    i: int
      Snapshot index; column i of running_predictions_dp is plotted

  Returns:
    fitted_line: list
      Line artists returned by ax.plot
  """
  ax.clear()
  # Training points
  ax.scatter(X[:, 0, :].numpy(), Y[:, 0, :].numpy())
  # Predictions on the test grid at this snapshot
  fitted_line = ax.plot(X_test[:, 0, :].detach().numpy(),
                        running_predictions_dp[:, i])
  # Snapshots are taken every 500 epochs
  plt.title(f"Epoch: {i*500}")
  ax.set_xlabel("X axis")
  ax.set_ylabel("Y axis")
  return fitted_line


# 20 frames, one per snapshot index
anim = animation.FuncAnimation(fig, frame, frames=range(20),
                               blit=False, repeat=False,
                               repeat_delay=10000)
# Render to an HTML5 video and close the figure before displaying the video
html_anim = HTML(anim.to_html5_video())
plt.close()
display(html_anim)

# @markdown Plot the test losses with epoch

# Test loss per epoch, with and without dropout
plt.figure()
for _losses, _label, _color in ((test_loss_dp, 'Test loss dropout', 'blue'),
                                (test_loss, 'Test loss', 'red')):
  plt.plot(_losses, label=_label, c=_color, ls='dashed')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.title('Dropout vs Without dropout')
plt.legend()
plt.show()

# @markdown Plot the train losses with epoch

# Training loss per epoch, with and without dropout
plt.figure()
for _losses, _label, _color in ((train_loss_dp, 'Train loss dropout', 'blue'),
                                (train_loss, 'Train loss', 'red')):
  plt.plot(_losses, label=_label, c=_color, ls='dashed')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.title('Dropout vs Without dropout')
plt.legend()
plt.show()

# @markdown Plot model weights with epoch
# Norm of the model recorded at every epoch, with and without dropout
plt.figure()
for _norms, _label in ((model_norm_dp, 'Dropout'),
                       (model_norm, 'No dropout')):
  plt.plot(_norms, label=_label)
plt.ylabel('Norm of the model')
plt.xlabel('Epochs')
plt.legend()
plt.title('Size of the model vs Epochs')
plt.show()

考えてみよう 2.1!: ドロップアウト

ドロップアウトありのモデルは、初期のドロップアウトなしモデルよりも良い性能を示したと思いますか？

解答を見る

# @title Submit your feedback
# Render the feedback widget for this discussion; content_review is defined
# near the top of the notebook and the key is prefixed with feedback_prefix.
content_review(f"{feedback_prefix}_Dropout_Discussion")

セクション2.1: ドロップアウト実装上の注意点

ドロップアウトは訓練時のみ使用します。テスト時はモデル全体の重みを使うため、テスト前に必ずmodel.eval()メソッドを呼ぶことが重要です。
ドロップアウトは訓練中にモデルの容量を減らすため、一般的にはドロップアウトを使う場合はより広いネットワークを用います。例えば、ドロップアウト確率が0.5の場合、その層の隠れニューロン数を2倍にすることが推奨されます。

次に、「Animal Faces」データセットでドロップアウトがどのように機能するかを見てみましょう。既存のモデルにドロップアウトを追加してから訓練します。

class AnimalNetDropout(nn.Module):
  """
  Fully connected classifier with dropout for 3x32x32 inputs:
  fc1: nn.Linear(3*32*32, 248) -> Dropout(p=0.5) -> leaky_relu
  fc2: nn.Linear(248, 210)     -> Dropout(p=0.3) -> leaky_relu
  fc3: nn.Linear(210, 3)       -> log_softmax over the 3 outputs
  """

  def __init__(self):
    """
    Create the linear and dropout layers of AnimalNetDropout

    Args:
      None

    Returns:
      Nothing
    """
    super().__init__()
    # Linear layers
    self.fc1 = nn.Linear(3*32*32, 248)
    self.fc2 = nn.Linear(248, 210)
    self.fc3 = nn.Linear(210, 3)
    # Dropout applied to the outputs of fc1 and fc2 respectively
    self.dropout1 = nn.Dropout(p=0.5)
    self.dropout2 = nn.Dropout(p=0.3)

  def forward(self, x):
    """
    Forward pass of AnimalNetDropout

    Args:
      x: torch.tensor
        Input features

    Returns:
      output: torch.tensor
        Log-probabilities, one row per input sample
    """
    # Flatten everything except the leading (batch) dimension
    flat = x.view(x.shape[0], -1)
    hidden1 = F.leaky_relu(self.dropout1(self.fc1(flat)))
    hidden2 = F.leaky_relu(self.dropout2(self.fc2(hidden1)))
    logits = self.fc3(hidden2)
    return F.log_softmax(logits, dim=1)

# Set the arguments
args = dict(
    test_batch_size=1000,
    epochs=200,
    lr=5e-3,
    batch_size=32,
    momentum=0.9,
    device=DEVICE,
    log_interval=100,
)

# Dropout model: re-seed, build and train
set_seed(seed=SEED)
model = AnimalNetDropout()
# main returns four values; the third one is not used here
val_acc_dropout, train_acc_dropout, _, model_dp = main(
    args, model, train_loader, val_loader, img_test_dataset)

# BigAnimalNet model: re-seed, build and train with the same arguments
set_seed(seed=SEED)
model = BigAnimalNet()
val_acc_big, train_acc_big, _, model_big = main(
    args, model, train_loader, val_loader, img_test_dataset)


# Train and Test accuracy plot
# Validation curves are dashed, training curves are solid; one color per model
plt.figure()
for _val_acc, _train_acc, _tag, _color in (
    (val_acc_big, train_acc_big, 'Big', 'blue'),
    (val_acc_dropout, train_acc_dropout, 'DP', 'magenta'),
):
  plt.plot(_val_acc, label=f'Val - {_tag}', c=_color, ls='dashed')
  plt.plot(_train_acc, label=f'Train - {_tag}', c=_color, ls='solid')
plt.title('Dropout')
plt.ylabel('Accuracy (%)')
plt.xlabel('Epoch')
plt.legend()
plt.show()

考えてみよう 2.2! ドロップアウトの注意点

ドロップアウトが悪影響を及ぼす場合はどんな時だと思いますか？また、モデル内での配置は重要だと思いますか？

解答を見る

# @title Submit your feedback
# Render the feedback widget for this discussion; content_review is defined
# near the top of the notebook and the key is prefixed with feedback_prefix.
content_review(f"{feedback_prefix}_Dropout_Caveats_Discussion")

セクション3: データ拡張

所要時間の目安: 約15分

# @title Video 3: Data Augmentation
from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display


class PlayVideo(IFrame):
  """
  IFrame that embeds a video hosted on Bilibili or OSF.

  Args:
    id: str
      Video identifier, inserted into the player URL
    source: str
      Hosting platform, either 'Bilibili' or 'Osf'
    page: int
      Page number inserted into the Bilibili player URL
    width: int
      Frame width forwarded to IFrame
    height: int
      Frame height forwarded to IFrame
    **kwargs:
      Extra keyword arguments forwarded to IFrame

  Raises:
    ValueError: if `source` is neither 'Bilibili' nor 'Osf'
  """

  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Without this branch `src` would stay unbound and the constructor would
      # fail with an opaque UnboundLocalError.
      raise ValueError(f"Unsupported video source: {source!r} "
                       "(expected 'Bilibili' or 'Osf')")
    super().__init__(src, width, height, **kwargs)


def display_videos(video_ids, W=400, H=300, fs=1):
  """
  Render every video inside its own output widget.

  Args:
    video_ids: list of tuples
      (source, id) pairs; 'Youtube' entries use YouTubeVideo, every other
      source is handed to PlayVideo
    W: int
      Player width
    H: int
      Player height
    fs: int
      Forwarded to the player as its `fs` argument

  Returns:
    tab_contents: list
      One widgets.Output per entry of video_ids, in the same order
  """
  tab_contents = []
  for entry in video_ids:
    out = widgets.Output()
    # Everything printed or displayed inside the context lands in `out`
    with out:
      platform = entry[0]
      if platform == 'Youtube':
        video = YouTubeVideo(id=entry[1], width=W,
                             height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=entry[1], source=platform, width=W,
                          height=H, fs=fs, autoplay=False)
        if platform == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif platform == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    tab_contents.append(out)
  return tab_contents


# Videos of this section as (platform, id) pairs
video_ids = [
    ('Youtube', 'nm44FhjL3xc'),
    ('Bilibili', 'BV1Xw411d7Pz'),
]
tab_contents = display_videos(video_ids, W=854, H=480)

# One tab per video, titled with the name of the hosting platform
tabs = widgets.Tab()
tabs.children = tab_contents
for i, _entry in enumerate(video_ids):
  tabs.set_title(i, _entry[0])
display(tabs)

# @title Submit your feedback
content_review(f"{feedback_prefix}_Data_Augmentation_Video")

データ拡張は訓練サンプル数を増やすためによく使われます。ここでは、各エポック後に訓練データにノイズを加えることで正則化を実現する効果を探ります。

PyTorchのtorchvisionモジュールには画像データセットに使えるいくつかの組み込みデータ拡張手法があります。よく使う手法は以下の通りです：

ランダムクロップ
ランダム回転
垂直反転
水平反転

# @markdown ####  Data Loader without Data Augmentation

# Seeded generator handed to both DataLoaders below
g_seed = torch.Generator()
g_seed.manual_seed(SEED)

# Convert images to tensors, then normalize each of the three channels with
# mean 0.5 and std 0.5
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
# pathlib keeps the path portable across operating systems
data_path = pathlib.Path('.') / 'afhq'
img_dataset = ImageFolder(data_path / 'train', transform=train_transform)

# Random split: 250 training images, 100 validation images, the remaining
# 14280 are discarded
img_train_data, img_val_data, _ = torch.utils.data.random_split(
    img_dataset, [250, 100, 14280])

# Settings shared by the training and validation loaders
_loader_kwargs = dict(num_workers=2,
                      worker_init_fn=seed_worker,
                      generator=g_seed)
train_loader = torch.utils.data.DataLoader(img_train_data,
                                           batch_size=batch_size,
                                           **_loader_kwargs)
val_loader = torch.utils.data.DataLoader(img_val_data,
                                         batch_size=1000,
                                         **_loader_kwargs)

torchvision.transformsを使ってデータをランダムに拡張するDataLoaderを定義してください。詳細はこちらを参照してください。

# Data Augmentation using transforms
# Random horizontal/vertical flips (each with probability 0.1) are applied
# before the same tensor conversion and normalization used without
# augmentation.
new_transforms = transforms.Compose([
                                     transforms.RandomHorizontalFlip(p=0.1),
                                     transforms.RandomVerticalFlip(p=0.1),
                                     transforms.ToTensor(),
                                     transforms.Normalize((0.5, 0.5, 0.5),
                                                          (0.5, 0.5, 0.5))
                                     ])

data_path = pathlib.Path('.')/'afhq'  # Using pathlib to be compatible with all OS's
img_dataset = ImageFolder(data_path/'train', transform=new_transforms)
# Reuse exactly the images that were selected for `img_train_data` instead of
# drawing a second random split: a fresh split would pick a different subset,
# which makes the comparison with the non-augmented run unfair and can place
# images from `img_val_data` into the augmented training set.
new_train_data = torch.utils.data.Subset(img_dataset, img_train_data.indices)

# For reproducibility
g_seed = torch.Generator()
g_seed.manual_seed(SEED)

# Creating the augmented train_loader (validation keeps using val_loader)
new_train_loader = torch.utils.data.DataLoader(new_train_data,
                                               batch_size=batch_size,
                                               worker_init_fn=seed_worker,
                                               generator=g_seed)

# Set the arguments
args = dict(
    epochs=250,
    lr=1e-3,
    momentum=0.99,
    device=DEVICE,
)

# Model trained on the augmented loader
set_seed(seed=SEED)
model_aug = AnimalNet()
# main returns four values; the last one is not used here
val_acc_dataaug, train_acc_dataaug, param_norm_dataaug, _ = main(
    args, model_aug, new_train_loader, val_loader, img_test_dataset)

# Model trained on the loader without augmentation, starting from the same seed
set_seed(seed=SEED)
model_pure = AnimalNet()
val_acc_pure, train_acc_pure, param_norm_pure, _ = main(
    args, model_pure, train_loader, val_loader, img_test_dataset)


# Train and Test accuracy plot
# Validation curves are dashed, training curves are solid; one color per run
plt.figure()
_runs = (
    (val_acc_pure, train_acc_pure, 'Pure', 'red'),
    (val_acc_dataaug, train_acc_dataaug, 'data augment', 'blue'),
)
for _val_acc, _train_acc, _tag, _color in _runs:
  plt.plot(_val_acc, label=f'Val Accuracy {_tag}',
           c=_color, ls='dashed')
  plt.plot(_train_acc, label=f'Train Accuracy {_tag}',
           c=_color, ls='solid')
# Horizontal line at the best validation accuracy of each run
for _val_acc, _, _tag, _color in _runs:
  plt.axhline(y=max(_val_acc), c=_color, ls='dashed')
plt.title('Data Augmentation')
plt.ylabel('Accuracy (%)')
plt.xlabel('Epoch')
plt.legend()
plt.show()

# Plot together: without and with augmentation
# Each stored norm is moved to the CPU and converted to a Python scalar
plt.figure()
for _norms, _color, _label in (
    (param_norm_pure, 'red', 'Without Augmentation'),
    (param_norm_dataaug, 'blue', 'With Augmentation'),
):
  plt.plot([_norm.cpu().numpy().item() for _norm in _norms],
           c=_color, label=_label)
plt.title('Norm of parameters as a function of training epoch')
plt.xlabel('Epoch')
plt.ylabel('Norm of model parameters')
plt.legend()
plt.show()

考えてみよう 3.1!: データ拡張

訓練データを拡張する他の方法を思いつきますか？（物体認識以外の問題も考えてみてください）

解答を見る

# @title Submit your feedback
# Render the feedback widget for this discussion; content_review is defined
# near the top of the notebook and the key is prefixed with feedback_prefix.
content_review(f"{feedback_prefix}_Data_Augmentation_Discussuion")

考えてみよう 3.2!: 過剰パラメータ化モデル vs 小規模NN

なぜ小さなニューラルネットワークよりも過剰パラメータ化されたANNを正則化する方が良いのでしょうか？知っている正則化手法を考慮して、各グループで10分間議論してください。

解答を見る

# @title Submit your feedback
# Render the feedback widget for this discussion; content_review is defined
# near the top of the notebook and the key is prefixed with feedback_prefix.
content_review(f"{feedback_prefix}_Overparameterized_vs_Small_NN_Discussuion")

セクション4: 確率的勾配降下法

所要時間の目安: 約20分

# @title Video 4: SGD
from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display


class PlayVideo(IFrame):
  """
  IFrame that embeds a video hosted on Bilibili or OSF.

  Args:
    id: str
      Video identifier, inserted into the player URL
    source: str
      Hosting platform, either 'Bilibili' or 'Osf'
    page: int
      Page number inserted into the Bilibili player URL
    width: int
      Frame width forwarded to IFrame
    height: int
      Frame height forwarded to IFrame
    **kwargs:
      Extra keyword arguments forwarded to IFrame

  Raises:
    ValueError: if `source` is neither 'Bilibili' nor 'Osf'
  """

  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Without this branch `src` would stay unbound and the constructor would
      # fail with an opaque UnboundLocalError.
      raise ValueError(f"Unsupported video source: {source!r} "
                       "(expected 'Bilibili' or 'Osf')")
    super().__init__(src, width, height, **kwargs)


def display_videos(video_ids, W=400, H=300, fs=1):
  """
  Render every video inside its own output widget.

  Args:
    video_ids: list of tuples
      (source, id) pairs; 'Youtube' entries use YouTubeVideo, every other
      source is handed to PlayVideo
    W: int
      Player width
    H: int
      Player height
    fs: int
      Forwarded to the player as its `fs` argument

  Returns:
    tab_contents: list
      One widgets.Output per entry of video_ids, in the same order
  """
  tab_contents = []
  for entry in video_ids:
    out = widgets.Output()
    # Everything printed or displayed inside the context lands in `out`
    with out:
      platform = entry[0]
      if platform == 'Youtube':
        video = YouTubeVideo(id=entry[1], width=W,
                             height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=entry[1], source=platform, width=W,
                          height=H, fs=fs, autoplay=False)
        if platform == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif platform == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    tab_contents.append(out)
  return tab_contents


# Videos of this section as (platform, id) pairs
video_ids = [
    ('Youtube', 'rjzlFvJhNqE'),
    ('Bilibili', 'BV1nM4y1K7wP'),
]
tab_contents = display_videos(video_ids, W=854, H=480)

# One tab per video, titled with the name of the hosting platform
tabs = widgets.Tab()
tabs.children = tab_contents
for i, _entry in enumerate(video_ids):
  tabs.set_title(i, _entry[0])
display(tabs)

# @title Submit your feedback
content_review(f"{feedback_prefix}_SGD_Video")

セクション4.1: 学習率

このセクションでは、学習率がニューラルネットワークの訓練時に正則化として働く様子を見ます。まとめると：

小さい学習率は正則化効果が弱く、ゆっくりと深い極小値に収束する。
大きい学習率は正則化効果が強く、局所極小値を飛び越えてより広く平坦な極小値に収束しやすく、これが一般化性能を高めることが多い。

ただし、非常に大きい学習率はオーバーシュートや悪い局所極小値に陥る可能性があるので注意が必要です。

以下のブロックでは、異なる学習率でAnimalNetモデルを訓練し、正則化への影響を観察します。

# @markdown #### Generating Data Loaders

# Seeded generator handed to both DataLoaders below
g_seed = torch.Generator()
g_seed.manual_seed(SEED)

batch_size = 128
# Convert images to tensors, then normalize each of the three channels with
# mean 0.5 and std 0.5
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# pathlib keeps the path portable across operating systems
data_path = pathlib.Path('.') / 'afhq'
img_dataset = ImageFolder(data_path / 'train', transform=train_transform)
# Random split into 11700 training and 2930 validation images
img_train_data, img_val_data = torch.utils.data.random_split(
    img_dataset, [11700, 2930])

# Settings shared by the training and validation loaders
_loader_kwargs = dict(num_workers=2,
                      worker_init_fn=seed_worker,
                      generator=g_seed)
full_train_loader = torch.utils.data.DataLoader(img_train_data,
                                                batch_size=batch_size,
                                                **_loader_kwargs)
full_val_loader = torch.utils.data.DataLoader(img_val_data,
                                              batch_size=1000,
                                              **_loader_kwargs)

# The test images get the same preprocessing as the training images
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
img_test_dataset = ImageFolder(data_path / 'val', transform=test_transform)
# With dataloaders: img_test_loader = DataLoader(img_test_dataset, batch_size=batch_size,shuffle=False, num_workers=1)
classes = ('cat', 'dog', 'wild')

# Set the arguments ('lr' is filled in per run inside the loop)
args = dict(
    test_batch_size=1000,
    epochs=20,
    batch_size=32,
    momentum=0.99,
    device=DEVICE,
)

learning_rates = [5e-4, 1e-3, 5e-3]
acc_dict = {}

for i, lr in enumerate(learning_rates):
  # Every run starts from the same seed and a freshly built model
  set_seed(seed=SEED)
  model = AnimalNet()
  args['lr'] = lr
  # NOTE(review): this trains on `train_loader`/`val_loader`, not on the
  # `full_train_loader`/`full_val_loader` built in the previous cell;
  # confirm this is intended.
  val_acc, train_acc, param_norm, _ = main(
      args, model, train_loader, val_loader, img_test_dataset)
  # Store the outputs under keys indexed by the position of the learning rate
  acc_dict.update({
      f'val_{i}': val_acc,
      f'train_{i}': train_acc,
      f'param_norm_{i}': param_norm,
  })

# @markdown Plot Train and Validation accuracy (Run me)
# One dashed (validation) and one solid (train) curve per learning rate
plt.figure()
for i, lr in enumerate(learning_rates):
  _val_curve = acc_dict[f'val_{i}']
  _train_curve = acc_dict[f'train_{i}']
  plt.plot(_val_curve, linestyle='dashed',
           label=f'lr={lr:0.1e} - validation')
  plt.plot(_train_curve, label=f'{lr:0.1e} - train')

  # Best value of the stored `val_{i}` curve for this learning rate
  print(f"Maximum Test Accuracy obtained with lr={lr:0.1e}: {max(_val_curve)}")

plt.title('Optimal Learning Rate')
plt.ylabel('Accuracy (%)')
plt.xlabel('Epoch')
plt.legend()
plt.show()

# @markdown Plot parametric norms (Run me)
# One curve per learning rate; each stored norm is moved to the CPU and
# converted to a Python scalar
plt.figure()
for i, lr in enumerate(learning_rates):
  _norm_values = [_norm.cpu().numpy().item()
                  for _norm in acc_dict[f'param_norm_{i}']]
  plt.plot(_norm_values, label=f'lr={lr:0.2e}')
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Parameter norms')
plt.show()

上記のモデルでは、期待とは異なる現象が観察されました。なぜそうなったと思いますか？

セクション5: ハイパーパラメータ調整

所要時間の目安: 約5分

# @title Video 5: Hyperparameter tuning
from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display


class PlayVideo(IFrame):
  """
  IFrame that embeds a video hosted on Bilibili or OSF.

  Args:
    id: str
      Video identifier, inserted into the player URL
    source: str
      Hosting platform, either 'Bilibili' or 'Osf'
    page: int
      Page number inserted into the Bilibili player URL
    width: int
      Frame width forwarded to IFrame
    height: int
      Frame height forwarded to IFrame
    **kwargs:
      Extra keyword arguments forwarded to IFrame

  Raises:
    ValueError: if `source` is neither 'Bilibili' nor 'Osf'
  """

  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Without this branch `src` would stay unbound and the constructor would
      # fail with an opaque UnboundLocalError.
      raise ValueError(f"Unsupported video source: {source!r} "
                       "(expected 'Bilibili' or 'Osf')")
    super().__init__(src, width, height, **kwargs)


def display_videos(video_ids, W=400, H=300, fs=1):
  """
  Render every video inside its own output widget.

  Args:
    video_ids: list of tuples
      (source, id) pairs; 'Youtube' entries use YouTubeVideo, every other
      source is handed to PlayVideo
    W: int
      Player width
    H: int
      Player height
    fs: int
      Forwarded to the player as its `fs` argument

  Returns:
    tab_contents: list
      One widgets.Output per entry of video_ids, in the same order
  """
  tab_contents = []
  for entry in video_ids:
    out = widgets.Output()
    # Everything printed or displayed inside the context lands in `out`
    with out:
      platform = entry[0]
      if platform == 'Youtube':
        video = YouTubeVideo(id=entry[1], width=W,
                             height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=entry[1], source=platform, width=W,
                          height=H, fs=fs, autoplay=False)
        if platform == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif platform == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    tab_contents.append(out)
  return tab_contents


# Videos of this section as (platform, id) pairs
video_ids = [
    ('Youtube', 'HgkiKRYc-3A'),
    ('Bilibili', 'BV1E44y127Sn'),
]
tab_contents = display_videos(video_ids, W=854, H=480)

# One tab per video, titled with the name of the hosting platform
tabs = widgets.Tab()
tabs.children = tab_contents
for i, _entry in enumerate(video_ids):
  tabs.set_title(i, _entry[0])
display(tabs)

# @title Submit your feedback
content_review(f"{feedback_prefix}_Hyperparameter_tuning_Video")

ハイパーパラメータ調整はしばしば難しく時間がかかりますが、良い一般化性能を得るために重要な工程です。探索をガイドするためにいくつかの手法があります。

グリッドサーチ: すべてのハイパーパラメータの組み合わせを試す
ランダムサーチ: ランダムに異なる組み合わせを試す
座標降下法: あるハイパーパラメータセットから始め、一つずつ変えて検証誤差が減る変更を受け入れる
ベイズ最適化 / Auto ML: 類似問題で効果的だったハイパーパラメータセットから始め、局所探索（例：勾配降下）を行う

探索範囲や最初に最適化するパラメータなど、選択肢は多くあります。ドロップアウトの確率は0.5か0.2のどちらかがよく使われ、それ以外はあまり変わらないことが多いですが、ネットワークのサイズや深さは大きく影響します。類似問題で効果的だった設定を参考にするのが鍵です。

ネットワーク構造の調整を自動化する手法として*ニューラルアーキテクチャサーチ（NAS）*があります。NASは線形層、畳み込み層などのビルディングブロックを用いて新しい構造を設計し、グリッドサーチ、強化学習、勾配降下法、進化的アルゴリズムなど多様な手法で性能を最適化します。これには非常に高い計算リソースが必要です。詳細はこの記事を参照してください。

考えてみよう 5: 正則化手法の総括

今日学んだ正則化手法の中で、ネットワークに最も大きな効果を与えたのはどれだと思いますか？なぜそう思いますか？同じネットワークにすべての正則化手法を適用できますか？

解答を見る

# @title Submit your feedback
# Render the feedback widget for this discussion; content_review is defined
# near the top of the notebook and the key is prefixed with feedback_prefix.
content_review(f"{feedback_prefix}_Overview_of_regularization_techniques_Discussion")

まとめ

おめでとうございます！NMA-DLの第1週を修了しました！

このチュートリアルでは、L1およびL2正則化、ドロップアウト、データ拡張などの正則化手法を学びました。最後に、SGDの学習率も正則化として働くことを見ました。興味深い論文はこちらにあります。

時間があれば、ボーナス教材の敵対的攻撃もぜひご覧ください！

ボーナス: 敵対的攻撃

所要時間の目安: 約15分

# @title Video 6: Adversarial Attacks
from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display


class PlayVideo(IFrame):
  """
  IFrame that embeds a video hosted on Bilibili or OSF.

  Args:
    id: str
      Video identifier, inserted into the player URL
    source: str
      Hosting platform, either 'Bilibili' or 'Osf'
    page: int
      Page number inserted into the Bilibili player URL
    width: int
      Frame width forwarded to IFrame
    height: int
      Frame height forwarded to IFrame
    **kwargs:
      Extra keyword arguments forwarded to IFrame

  Raises:
    ValueError: if `source` is neither 'Bilibili' nor 'Osf'
  """

  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Without this branch `src` would stay unbound and the constructor would
      # fail with an opaque UnboundLocalError.
      raise ValueError(f"Unsupported video source: {source!r} "
                       "(expected 'Bilibili' or 'Osf')")
    super().__init__(src, width, height, **kwargs)


def display_videos(video_ids, W=400, H=300, fs=1):
  """
  Render every video inside its own output widget.

  Args:
    video_ids: list of tuples
      (source, id) pairs; 'Youtube' entries use YouTubeVideo, every other
      source is handed to PlayVideo
    W: int
      Player width
    H: int
      Player height
    fs: int
      Forwarded to the player as its `fs` argument

  Returns:
    tab_contents: list
      One widgets.Output per entry of video_ids, in the same order
  """
  tab_contents = []
  for entry in video_ids:
    out = widgets.Output()
    # Everything printed or displayed inside the context lands in `out`
    with out:
      platform = entry[0]
      if platform == 'Youtube':
        video = YouTubeVideo(id=entry[1], width=W,
                             height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=entry[1], source=platform, width=W,
                          height=H, fs=fs, autoplay=False)
        if platform == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif platform == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    tab_contents.append(out)
  return tab_contents


# Videos of this section as (platform, id) pairs
video_ids = [
    ('Youtube', 'LzPPoiKi5jE'),
    ('Bilibili', 'BV19o4y1X74u'),
]
tab_contents = display_videos(video_ids, W=854, H=480)

# One tab per video, titled with the name of the hosting platform
tabs = widgets.Tab()
tabs.children = tab_contents
for i, _entry in enumerate(video_ids):
  tabs.set_title(i, _entry[0])
display(tabs)

# @title Submit your feedback
content_review(f"{feedback_prefix}_Adversarial_Attacks_Bonus_Video")

入力データに摂動を加えて機械学習モデルを騙すことを「敵対的攻撃」と呼びます。これらの攻撃は高次元空間で複雑な決定境界を学習することの避けられない副産物です。用途によっては非常に危険です。

したがって、このような攻撃に対抗できるモデルを構築する必要があります。一つの方法はネットワークを正則化して決定境界を滑らかにすることです。敵対的攻撃に強いモデル構築の方法には以下があります：

防御的蒸留（Defensive Distillation）: 蒸留で訓練されたモデルはソフトラベルで学習し、訓練過程にランダム性があるため攻撃に強い。
特徴圧縮（Feature Squeezing）: 入力を圧縮する前後でモデルの予測を比較し、オンライン分類器の敵対的攻撃を検出する。
SGD: 敵対者が最大化しようとするものを最小化する重みをSGDで選ぶことも可能。

敵対的攻撃についての詳細はこちらをご覧ください。

チュートリアル 2: 正則化手法 パート2

チュートリアルの目的

セットアップ

セクション1: L1およびL2正則化

セクション1.1: 正則化なしモデル

セクション1.2: L1正則化

コーディング演習 1.1: L1正則化

セクション1.3: L2 / リッジ正則化

コーディング演習 1.2: L2正則化

セクション2: ドロップアウト

考えてみよう 2.1!: ドロップアウト

セクション2.1: ドロップアウト実装上の注意点

考えてみよう 2.2! ドロップアウトの注意点

セクション3: データ拡張

考えてみよう 3.1!: データ拡張

考えてみよう 3.2!: 過剰パラメータ化モデル vs 小規模NN

セクション4: 確率的勾配降下法

セクション4.1: 学習率

セクション5: ハイパーパラメータ調整

考えてみよう 5: 正則化手法の総括

まとめ

ボーナス: 敵対的攻撃

チュートリアル 2: 正則化手法パート2