チュートリアル 1: 時系列処理の紹介

第3週、第1日目: 時系列と自然言語処理

Neuromatch Academyによる

コンテンツ制作者: Lyle Ungar, Kelson Shilling-Scrivo, Alish Dipani

コンテンツレビュアー: Kelson Shilling-Scrivo

コンテンツ編集者: Gagana B, Spiros Chavlis, Kelson Shilling-Scrivo

制作編集者: Gagana B, Spiros Chavlis

元コンテンツ: Anushree Hede, Pooja Consul, Ann-Katrin Reuel

チュートリアルの目的

リカレントニューラルネットワーク（RNN）がシーケンスのモデリングに優れている理由を探る前に、他のシーケンスモデリング方法、テキストのエンコード方法、そしてそのようなエンコーディングや埋め込みを使った意味のある測定方法を探ります。

# @title Tutorial slides
from IPython.display import IFrame
# OSF identifier of the slide deck; reused for the download link and the embed.
link_id = "n263c"
slides_src = f"https://mfr.ca-1.osf.io/render?url=https://osf.io/{link_id}/?direct%26mode=render%26action=download%26mode=render"
print(f"If you want to download the slides: https://osf.io/download/{link_id}/")
# Kept as the last expression of the cell so the notebook displays the frame.
IFrame(src=slides_src, width=854, height=480)

セットアップ

# @title Install dependencies

# @markdown There may be *errors* and/or *warnings* reported during the installation. However, they are to be ignored.

# @title Install and import feedback gadget


from vibecheck import DatatopsContentReviewContainer
def content_review(notebook_section: str):
    """Build and render the feedback widget for one notebook section.

    Args:
      notebook_section: identifier passed to the review container as the
        section the feedback belongs to.

    Returns:
      Whatever `DatatopsContentReviewContainer.render()` returns.
    """
    datatops_settings = {
        "url": "https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab",
        "name": "neuromatch_dl",
        "user_key": "f379rz8y",
    }
    # The first argument is the text prompt; it is intentionally left empty.
    container = DatatopsContentReviewContainer(
        "",
        notebook_section,
        datatops_settings,
    )
    return container.render()


feedback_prefix = "W3D1_T1"

# Imports
import time
import nltk
import datasets
import fasttext
import tokenizers
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import brown

import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split

# @title Figure Settings
import logging
logging.getLogger('matplotlib.font_manager').disabled = True

import ipywidgets as widgets
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use("https://raw.githubusercontent.com/NeuromatchAcademy/content-creation/main/nma.mplstyle")

# @title  Load Dataset from `nltk`
# No critical warnings, so we suppress it
import warnings
warnings.simplefilter("ignore")

nltk.download('brown')

# @title Helper functions

# Gensim Word2Vec shim using fasttext
class Word2Vec:
  """Small gensim-style Word2Vec shim that trains with fastText.

  The sentences are written to ``sentences.txt`` (one space-joined sentence
  per line) and an unsupervised fastText model is trained on that file. The
  trained model is exposed as ``self.wv`` and gets an extra
  ``most_similar(word, topn)`` function attached to it.

  Args:
    sentences: iterable of sentences, each an iterable of string tokens.
    vector_size: embedding dimension (passed to fastText as ``dim``).
    min_count: minimum word frequency (passed as ``minCount``).
    sg: gensim convention; truthy selects skip-gram, falsy selects CBOW.
    workers: number of training threads (passed as ``thread``).
  """

  def __init__(self, sentences, vector_size=100, min_count=5, sg=1, workers=1):
    with open("sentences.txt", "w", encoding="utf-8") as f:
      for sentence in sentences:
        f.write(" ".join(sentence) + "\n")
    # Train only after the file is closed, so every buffered sentence is on
    # disk before fastText reads it.
    self.wv = fasttext.train_unsupervised("sentences.txt",
                                          model='skipgram' if sg else 'cbow',
                                          dim=vector_size,
                                          minCount=min_count,
                                          thread=workers)

    def most_similar(word, topn=10):
      # Swap each neighbour pair so the second element comes first
      # (gensim-style (word, score), assuming fastText yields (score, word)).
      return [(n[1], n[0]) for n in self.wv.get_nearest_neighbors(word, k=topn)]
    self.wv.most_similar = most_similar

  def __str__(self):
    # __str__ must return a str; returning the raw word list raises TypeError.
    return str(self.wv.words)

  def __iter__(self):
    yield from self.wv.words

# simple english tokenizer
def get_tokenizer(vocab):
  """Create a word-level tokenizer over `vocab`.

  Args:
    vocab: vocabulary handed to `tokenizers.models.WordLevel`; "<unk>" is
      passed alongside it as the unknown-token string.

  Returns:
    A `tokenizers.Tokenizer` using the BERT normalizer and BERT pre-tokenizer.
  """
  word_level = tokenizers.Tokenizer(tokenizers.models.WordLevel(vocab, "<unk>"))
  word_level.normalizer = tokenizers.normalizers.BertNormalizer()
  word_level.pre_tokenizer = tokenizers.pre_tokenizers.BertPreTokenizer()
  return word_level

# @title Set random seed

# @markdown Executing `set_seed(seed=seed)` you are setting the seed

# For DL it's critical to set the random seed so that students can have a
# baseline to compare their results to expected results.
# Read more here: https://pytorch.org/docs/stable/notes/randomness.html

# Call `set_seed` function in the exercises to ensure reproducibility.
import random
import torch

def set_seed(seed=None, seed_torch=True):
  """
  Seed the `random`, NumPy and (optionally) PyTorch random number generators.

  Args:
    seed : Integer
      Seed value. When `None`, a value is drawn with
      `np.random.choice(2 ** 32)`. Default is `None`.
    seed_torch : Boolean
      If `True`, also seeds PyTorch (CPU and CUDA) and configures cuDNN with
      `benchmark = False` and `deterministic = True`. Default is `True`.

  Returns:
    Nothing. Prints the seed that was applied.
  """
  chosen_seed = np.random.choice(2 ** 32) if seed is None else seed

  random.seed(chosen_seed)
  np.random.seed(chosen_seed)

  if seed_torch:
    for seed_fn in (torch.manual_seed,
                    torch.cuda.manual_seed_all,
                    torch.cuda.manual_seed):
      seed_fn(chosen_seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

  print(f'Random seed {chosen_seed} has been set.')

# In case that `DataLoader` is used
def seed_worker(worker_id):
  """
  Seed NumPy and `random` from the current PyTorch seed.

  Intended as a `DataLoader` `worker_init_fn`
  (see https://pytorch.org/docs/stable/data.html#data-loading-randomness).

  Args:
    worker_id: integer
      ID of the worker; not used by this function.

  Returns:
    Nothing
  """
  # Reduce the torch seed to 32 bits before handing it to NumPy and `random`.
  derived_seed = torch.initial_seed() % 2**32
  for seed_fn in (np.random.seed, random.seed):
    seed_fn(derived_seed)

# @title Set device (GPU or CPU). Execute `set_device()`

# Inform the user if the notebook uses GPU or CPU.

def set_device():
  """
  Pick the compute device and tell the user which one is in use.

  Args:
    None

  Returns:
    device: String
      "cuda" when `torch.cuda.is_available()` is true, otherwise "cpu".
  """
  if torch.cuda.is_available():
    print("GPU is enabled in this notebook.")
    return "cuda"

  print("WARNING: For this notebook to perform best, "
      "if possible, in the menu under `Runtime` -> "
      "`Change runtime type.`  select `GPU` ")
  return "cpu"

DEVICE = set_device()
SEED = 2021
set_seed(seed=SEED)

セクション 1: イントロ: どんな時系列があるの？

所要時間の目安: 20分

# @title Video 1: Time Series and NLP
from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display


class PlayVideo(IFrame):
  """IFrame that embeds a video hosted on Bilibili or OSF.

  Args:
    id: video identifier on the hosting platform; stored as `self.id`.
    source: 'Bilibili' or 'Osf'; selects the embed URL template.
    page: page number inserted into the Bilibili player URL.
    width, height: forwarded to `IFrame`.
    **kwargs: forwarded to `IFrame` unchanged.

  Raises:
    ValueError: if `source` is not one of the supported platforms.
  """
  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Without this branch `src` stays unbound and the failure surfaces as
      # an UnboundLocalError on the line below.
      raise ValueError(f"Unsupported video source: {source!r}")
    super().__init__(src, width, height, **kwargs)


def display_videos(video_ids, W=400, H=300, fs=1):
  """Render every video into its own `widgets.Output` panel.

  Args:
    video_ids: sequence of entries whose first item is the source name
      ('Youtube', 'Bilibili' or 'Osf') and whose second item is the video id.
    W, H: player width and height.
    fs: forwarded to the player as the `fs` keyword.

  Returns:
    List of `widgets.Output`, one per entry, in input order.
  """
  panels = []
  for entry in video_ids:
    source, key = entry[0], entry[1]
    panel = widgets.Output()
    with panel:
      if source == 'Youtube':
        video = YouTubeVideo(id=key, width=W, height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=key, source=source, width=W, height=H,
                          fs=fs, autoplay=False)
        if source == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif source == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    panels.append(panel)
  return panels


video_ids = [('Youtube', 'W4RTRXt7pO0'), ('Bilibili', 'BV1E94y117Nf')]
tab_contents = display_videos(video_ids, W=854, H=480)
# One tab per video, titled with the name of its hosting platform.
tabs = widgets.Tab()
tabs.children = tab_contents
for i, entry in enumerate(video_ids):
  tabs.set_title(i, entry[0])
display(tabs)

# @title Submit your feedback
content_review(f"{feedback_prefix}_Time_Series_and_NLP_Video")

# @title Video 2: What is NLP?
from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display


class PlayVideo(IFrame):
  """IFrame that embeds a video hosted on Bilibili or OSF.

  Args:
    id: video identifier on the hosting platform; stored as `self.id`.
    source: 'Bilibili' or 'Osf'; selects the embed URL template.
    page: page number inserted into the Bilibili player URL.
    width, height: forwarded to `IFrame`.
    **kwargs: forwarded to `IFrame` unchanged.

  Raises:
    ValueError: if `source` is not one of the supported platforms.
  """
  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Without this branch `src` stays unbound and the failure surfaces as
      # an UnboundLocalError on the line below.
      raise ValueError(f"Unsupported video source: {source!r}")
    super().__init__(src, width, height, **kwargs)


def display_videos(video_ids, W=400, H=300, fs=1):
  """Render every video into its own `widgets.Output` panel.

  Args:
    video_ids: sequence of entries whose first item is the source name
      ('Youtube', 'Bilibili' or 'Osf') and whose second item is the video id.
    W, H: player width and height.
    fs: forwarded to the player as the `fs` keyword.

  Returns:
    List of `widgets.Output`, one per entry, in input order.
  """
  panels = []
  for entry in video_ids:
    source, key = entry[0], entry[1]
    panel = widgets.Output()
    with panel:
      if source == 'Youtube':
        video = YouTubeVideo(id=key, width=W, height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=key, source=source, width=W, height=H,
                          fs=fs, autoplay=False)
        if source == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif source == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    panels.append(panel)
  return panels


video_ids = [('Youtube', 'Q-PGZyaBQVk'), ('Bilibili', 'BV18v4y1M7GF')]
tab_contents = display_videos(video_ids, W=854, H=480)
# One tab per video, titled with the name of its hosting platform.
tabs = widgets.Tab()
tabs.children = tab_contents
for i, entry in enumerate(video_ids):
  tabs.set_title(i, entry[0])
display(tabs)

# @title Submit your feedback
content_review(f"{feedback_prefix}_What_is_NLP_Video")

セクション 2: 埋め込み

所要時間の目安: 50分

# @title Video 3: NLP Tokenization
from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display


class PlayVideo(IFrame):
  """IFrame that embeds a video hosted on Bilibili or OSF.

  Args:
    id: video identifier on the hosting platform; stored as `self.id`.
    source: 'Bilibili' or 'Osf'; selects the embed URL template.
    page: page number inserted into the Bilibili player URL.
    width, height: forwarded to `IFrame`.
    **kwargs: forwarded to `IFrame` unchanged.

  Raises:
    ValueError: if `source` is not one of the supported platforms.
  """
  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Without this branch `src` stays unbound and the failure surfaces as
      # an UnboundLocalError on the line below.
      raise ValueError(f"Unsupported video source: {source!r}")
    super().__init__(src, width, height, **kwargs)


def display_videos(video_ids, W=400, H=300, fs=1):
  """Render every video into its own `widgets.Output` panel.

  Args:
    video_ids: sequence of entries whose first item is the source name
      ('Youtube', 'Bilibili' or 'Osf') and whose second item is the video id.
    W, H: player width and height.
    fs: forwarded to the player as the `fs` keyword.

  Returns:
    List of `widgets.Output`, one per entry, in input order.
  """
  panels = []
  for entry in video_ids:
    source, key = entry[0], entry[1]
    panel = widgets.Output()
    with panel:
      if source == 'Youtube':
        video = YouTubeVideo(id=key, width=W, height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=key, source=source, width=W, height=H,
                          fs=fs, autoplay=False)
        if source == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif source == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    panels.append(panel)
  return panels


video_ids = [('Youtube', 'GLreyXm4rg8'), ('Bilibili', 'BV1ov4y1M7bQ')]
tab_contents = display_videos(video_ids, W=854, H=480)
# One tab per video, titled with the name of its hosting platform.
tabs = widgets.Tab()
tabs.children = tab_contents
for i, entry in enumerate(video_ids):
  tabs.set_title(i, entry[0])
display(tabs)

# @title Submit your feedback
content_review(f"{feedback_prefix}_NLP_tokenization_Video")

セクション 2.1: イントロダクション

Word2vec は単語埋め込みを生成する関連モデル群です。これらのモデルは浅い2層のニューラルネットワークで、単語の言語的文脈を再構築するように訓練されます。Word2vecは大量のテキストコーパスを入力として受け取り、コーパス内の各単語に対応するベクトルを割り当てたベクトル空間を生成します。

単語埋め込みの作成

Brownコーパスのカテゴリの一部に対して埋め込みを作成します。このタスクを達成するために、古いGensim APIに基づきつつFastTextライブラリを用いた互換性のあるWord2Vecクラスを使用します。このクラスは文のシーケンスを入力として期待します。各文は単語のリストです。

Word2vecは訓練速度と品質の両方に影響するいくつかのパラメータを受け入れます。

その一つは内部辞書の剪定用です。10億語のコーパスで1回か2回しか出現しない単語は、おそらく誤字やゴミであり、意味のある訓練データが不足しているため無視するのが最善です：

model = Word2Vec(sentences, min_count=10)  # デフォルトは5

min_countの妥当な値はデータセットのサイズに応じて0から100の間です。

もう一つのパラメータはニューラルネットワーク層のサイズを決めるvector_size（古いGensim APIではsize）で、訓練アルゴリズムの自由度に対応します：

model = Word2Vec(sentences, vector_size=200)  # デフォルトは100

vector_sizeの値が大きいほど訓練データが多く必要ですが、より良い（より正確な）モデルが得られる可能性があります。妥当な値は数十から数百です。

主要なパラメータの最後は訓練の並列化用で、訓練速度を上げます：

model = Word2Vec(sentences, workers=4)  # デフォルトは1（並列化なし）

# Categories used for the Brown corpus
category = ['editorial', 'fiction', 'government', 'mystery', 'news', 'religion',
            'reviews', 'romance', 'science_fiction']

# @markdown Word2Vec model

def create_word2vec_model(category='news', size=50, sg=1, min_count=5):
  """Train a Word2Vec model on sentences from the Brown corpus.

  Args:
    category: Brown corpus category (or categories) passed to `brown.sents`.
    size: embedding dimension, forwarded as `vector_size`.
    sg: forwarded to `Word2Vec` unchanged.
    min_count: forwarded to `Word2Vec` unchanged.

  Returns:
    The trained `Word2Vec` instance.
  """
  corpus_sentences = brown.sents(categories=category)
  return Word2Vec(corpus_sentences,
                  vector_size=size,
                  sg=sg,
                  min_count=min_count)


def model_dictionary(model):
  """Print the model's vector store and return its vocabulary.

  Args:
    model: object exposing `wv` with a `words` attribute (the `Word2Vec`
      shim in this notebook stores a fastText model there).

  Returns:
    List of the words in `model.wv.words`.
  """
  print(model.wv)
  # Read the vocabulary from `wv.words`: the shim's own `__iter__` reads the
  # same attribute, and iterating `wv` itself assumes it is iterable, which
  # the fastText model object is not expected to be.
  words = list(model.wv.words)
  return words


def get_embedding(word, model):
  """Look up the embedding of `word` in `model.wv`.

  Args:
    word: key to look up.
    model: object whose `wv` attribute supports `in` and indexing.

  Returns:
    `model.wv[word]` when the word is present, otherwise `None`.
  """
  vectors = model.wv
  return vectors[word] if word in vectors else None

このセルの実行には30～45秒かかります。

# Create a word2vec model based on categories from Brown corpus
w2vmodel = create_word2vec_model(category)

辞書内の単語の埋め込みベクトルを取得できます。

# get word list from Brown corpus
brown_wordlist = list(brown.words(categories=category))
# generate a random word
random_word = random.sample(brown_wordlist, 1)[0]
# get embedding of the random word
random_word_embedding = get_embedding(random_word, w2vmodel)
print(f'Embedding of "{random_word}" is {random_word_embedding}')

単語埋め込みの可視化

word2vecを使って辞書内の任意の単語の埋め込みを取得できます。これらの埋め込みが何を意味するのか直感的に理解するために可視化しましょう。word2vecモデルから得られる単語埋め込みは高次元空間にあり、tSNEを使って埋め込みの分散を最もよく捉える2つの特徴を選び、2次元空間に表現します。

keysの各単語について、上位10個の類似単語（コサイン類似度を使用）を選びプロットします。

コードを実行する前に少し考えてみてください：

類似単語はどのように配置されるべきでしょうか？
重要なクラスタの重心（プロット上のXで表現）は、なぜ一部のキー単語（大きなフォントで表現）に近く、他には遠いのでしょうか？

keys = ['voters', 'magic', 'love', 'God', 'evidence', 'administration', 'governments']

# @markdown ### Cluster embeddings related functions

# @markdown **Note:** We import [sklearn.manifold.TSNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html)
from sklearn.manifold import TSNE
import matplotlib.cm as cm

def get_cluster_embeddings(keys):
  """Collect each key's nearest neighbours and project them to 2D with t-SNE.

  Uses the module-level `w2vmodel`. Keys that are missing from its dictionary
  are reported and skipped, so fewer clusters than keys may be returned.

  Args:
    keys: iterable of words to build clusters around.

  Returns:
    embeddings_en_2d: array of shape (n_clusters, words_per_cluster, 2).
    word_clusters: list of word lists; each holds the similar words followed
      by the key word itself as the last entry.

  Raises:
    ValueError: if none of the keys are in the dictionary.
  """
  import inspect  # local import: only needed for the TSNE signature check

  embedding_clusters = []
  word_clusters = []

  # find closest words and add them to cluster
  for word in keys:
    if word not in w2vmodel.wv:
      print(f'The word {word} is not in the dictionary')
      continue

    words = [similar_word
             for similar_word, _ in w2vmodel.wv.most_similar(word, topn=10)]
    embeddings = [w2vmodel.wv[similar_word] for similar_word in words]
    # The key word goes last; the plotting code relies on that position.
    embeddings.append(get_embedding(word, w2vmodel))
    words.append(word)
    embedding_clusters.append(embeddings)
    word_clusters.append(words)

  if not embedding_clusters:
    # Fail with a clear message instead of an opaque shape-unpacking error.
    raise ValueError('None of the given keys are in the dictionary')

  # get embeddings for the words in clusters
  embedding_clusters = np.array(embedding_clusters)
  n, m, k = embedding_clusters.shape
  # scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter`; use whichever
  # name the installed version accepts.
  tsne_params = inspect.signature(TSNE.__init__).parameters
  iter_kwarg = 'max_iter' if 'max_iter' in tsne_params else 'n_iter'
  tsne_model_en_2d = TSNE(perplexity=10, n_components=2, init='pca',
                          random_state=32, **{iter_kwarg: 3500})
  flat_2d = tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))
  embeddings_en_2d = np.array(flat_2d).reshape(n, m, 2)
  return embeddings_en_2d, word_clusters


def tsne_plot_similar_words(title, labels, embedding_clusters,
                            word_clusters, opacity, filename=None):
  """Scatter-plot 2D word clusters, one colour per label.

  Args:
    title: figure title.
    labels: legend label for each cluster.
    embedding_clusters: per-cluster arrays indexed as [:, 0] (x) and [:, 1] (y).
    word_clusters: per-cluster word lists used to annotate the points.
    opacity: alpha of the scatter markers.
    filename: if truthy, the figure is also saved there as PNG.

  Returns:
    Nothing. Shows the figure.
  """
  plt.figure(figsize=(16, 9))
  palette = cm.rainbow(np.linspace(0, 1, len(labels)))
  for label, points, words, color in zip(labels, embedding_clusters, word_clusters, palette):
    xs, ys = points[:, 0], points[:, 1]
    plt.scatter(xs, ys, color=color, alpha=opacity, label=label)
    # Plot the cluster centroids
    plt.plot(np.mean(xs), np.mean(ys), 'x', color=color, markersize=16)
    for position, word in enumerate(words):
      # Entries from index 10 onwards are drawn in a larger font.
      if position < 10:
        font_size = 10
      else:
        font_size = 14
      plt.annotate(word, alpha=0.5, xy=(xs[position], ys[position]),
                   xytext=(5, 2), textcoords='offset points',
                   ha='right', va='bottom', size=font_size)
  plt.legend()
  plt.title(title)
  plt.grid(True)
  if filename:
    plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
  plt.show()

# Get closest words to the keys and get clusters of these words
embeddings_en_2d, word_clusters = get_cluster_embeddings(keys)
# tSNE plot of similar words to keys
tsne_plot_similar_words(title='Similar words from Brown Corpus',
                        labels=keys,
                        embedding_clusters=embeddings_en_2d,
                        word_clusters=word_clusters,
                        opacity=0.7,
                        filename='similar_words.png')

考えてみよう！ 2.1: 類似度

2つの単語埋め込み間の類似度が高いとはどういう意味でしょうか？
クラスタの重心（プロットのXで示される）が、なぜ一部のキー単語（大きなフォント）には近く、他のキー単語からは離れているのでしょうか？

解答を見る

# @title Submit your feedback
content_review(f"{feedback_prefix}_Similarity_Discussion")

セクション 2.2: 埋め込みの探求

# @title Video 4: Embeddings rule!
from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display


class PlayVideo(IFrame):
  """IFrame that embeds a video hosted on Bilibili or OSF.

  Args:
    id: video identifier on the hosting platform; stored as `self.id`.
    source: 'Bilibili' or 'Osf'; selects the embed URL template.
    page: page number inserted into the Bilibili player URL.
    width, height: forwarded to `IFrame`.
    **kwargs: forwarded to `IFrame` unchanged.

  Raises:
    ValueError: if `source` is not one of the supported platforms.
  """
  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Without this branch `src` stays unbound and the failure surfaces as
      # an UnboundLocalError on the line below.
      raise ValueError(f"Unsupported video source: {source!r}")
    super().__init__(src, width, height, **kwargs)


def display_videos(video_ids, W=400, H=300, fs=1):
  """Render every video into its own `widgets.Output` panel.

  Args:
    video_ids: sequence of entries whose first item is the source name
      ('Youtube', 'Bilibili' or 'Osf') and whose second item is the video id.
    W, H: player width and height.
    fs: forwarded to the player as the `fs` keyword.

  Returns:
    List of `widgets.Output`, one per entry, in input order.
  """
  panels = []
  for entry in video_ids:
    source, key = entry[0], entry[1]
    panel = widgets.Output()
    with panel:
      if source == 'Youtube':
        video = YouTubeVideo(id=key, width=W, height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=key, source=source, width=W, height=H,
                          fs=fs, autoplay=False)
        if source == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif source == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    panels.append(panel)
  return panels


video_ids = [('Youtube', '7ijjjFpcOwI'), ('Bilibili', 'BV1KN4y1G7sL')]
tab_contents = display_videos(video_ids, W=854, H=480)
# One tab per video, titled with the name of its hosting platform.
tabs = widgets.Tab()
tabs.children = tab_contents
for i, entry in enumerate(video_ids):
  tabs.set_title(i, entry[0])
display(tabs)

# @title Submit your feedback
content_review(f"{feedback_prefix}_Embeddings_rule_Video")

# @title Video 5: Distributional Similarity and Vector Embeddings
from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display


class PlayVideo(IFrame):
  """IFrame that embeds a video hosted on Bilibili or OSF.

  Args:
    id: video identifier on the hosting platform; stored as `self.id`.
    source: 'Bilibili' or 'Osf'; selects the embed URL template.
    page: page number inserted into the Bilibili player URL.
    width, height: forwarded to `IFrame`.
    **kwargs: forwarded to `IFrame` unchanged.

  Raises:
    ValueError: if `source` is not one of the supported platforms.
  """
  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Without this branch `src` stays unbound and the failure surfaces as
      # an UnboundLocalError on the line below.
      raise ValueError(f"Unsupported video source: {source!r}")
    super().__init__(src, width, height, **kwargs)


def display_videos(video_ids, W=400, H=300, fs=1):
  """Render every video into its own `widgets.Output` panel.

  Args:
    video_ids: sequence of entries whose first item is the source name
      ('Youtube', 'Bilibili' or 'Osf') and whose second item is the video id.
    W, H: player width and height.
    fs: forwarded to the player as the `fs` keyword.

  Returns:
    List of `widgets.Output`, one per entry, in input order.
  """
  panels = []
  for entry in video_ids:
    source, key = entry[0], entry[1]
    panel = widgets.Output()
    with panel:
      if source == 'Youtube':
        video = YouTubeVideo(id=key, width=W, height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=key, source=source, width=W, height=H,
                          fs=fs, autoplay=False)
        if source == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif source == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    panels.append(panel)
  return panels


video_ids = [('Youtube', '0vTuEIAnrII'), ('Bilibili', 'BV1sa411W7ks')]
tab_contents = display_videos(video_ids, W=854, H=480)
# One tab per video, titled with the name of its hosting platform.
tabs = widgets.Tab()
tabs.children = tab_contents
for i, entry in enumerate(video_ids):
  tabs.set_title(i, entry[0])
display(tabs)

# @title Submit your feedback
content_review(f"{feedback_prefix}_Distributional_Similarity_and_Vector_Embeddings_Video")

単語や形態素のようなサブワード単位は、言語で意味を表現する基本単位です。単語を実数ベクトルにマッピングする技術は単語埋め込みと呼ばれます。

このセクションでは、文脈を考慮しないword2vecに似た事前学習済みのfastText埋め込みを使用します。

埋め込み操作

FastText ライブラリを使って埋め込みを操作してみましょう。まずは「King」という単語の埋め込みを見つけます。

# @markdown ### Download FastText English Embeddings of dimension 100
# @markdown This will take 1-2 minutes to run

import os, zipfile, requests

url = "https://osf.io/2frqg/download"
fname = "cc.en.100.bin.gz"

print('Downloading Started...')
# Stream the response to disk chunk by chunk so the whole archive is never
# held in memory, and fail loudly on an HTTP error instead of saving an
# error page as the archive.
with requests.get(url, stream=True) as r:
  r.raise_for_status()
  with open(fname, 'wb') as f:
    for chunk in r.iter_content(chunk_size=1 << 20):
      f.write(chunk)
print('Downloading Completed.')

# opening the zip file in READ mode
# (the file name ends in ".gz", but it is read with `zipfile` here)
with zipfile.ZipFile(fname, 'r') as zipObj:
  # extracting all the files
  print('Extracting all the files now...')
  zipObj.extractall()
  print('Done!')
# Delete the archive only after it has been closed; removing a file that is
# still open fails on some platforms.
os.remove(fname)

# Load 100 dimension FastText Vectors using FastText library
ft_en_vectors = fasttext.load_model('cc.en.100.bin')
print(f"Length of the embedding is: {len(ft_en_vectors.get_word_vector('king'))}")
print(f"\nEmbedding for the word King is:\n {ft_en_vectors.get_word_vector('king')}")

単語間の類似度にはコサイン類似度が使われます。コサイン類似度は-1から1のスカラー値で表され、値が大きいほど類似度が高いことを意味します。

では、「king」に最も類似した単語トップ10を見つけてみましょう。

ft_en_vectors.get_nearest_neighbors("king", 10)  # Most similar by key

単語の類似度

単語間の類似度についてさらに詳しく見ていきます。異なる単語のペアがどれくらい似ているかを確認しましょう。

def cosine_similarity(vec_a, vec_b):
  """Return the cosine of the angle between `vec_a` and `vec_b`.

  Computed as the dot product divided by the product of the two L2 norms.
  """
  dot_product = np.dot(vec_a, vec_b)
  norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
  return dot_product / norm_product


def getSimilarity(word1, word2):
  """Cosine similarity between the fastText vectors of two words.

  Both vectors are read from the module-level `ft_en_vectors` model.
  """
  first_vec, second_vec = (ft_en_vectors.get_word_vector(w)
                           for w in (word1, word2))
  return cosine_similarity(first_vec, second_vec)


print(f"Similarity between the words King and Queen: {getSimilarity('king', 'queen')}")
print(f"Similarity between the words King and Knight: {getSimilarity('king', 'knight')}")
print(f"Similarity between the words King and Rock: {getSimilarity('king', 'rock')}")
print(f"Similarity between the words King and Twenty: {getSimilarity('king', 'twenty')}")

print(f"\nSimilarity between the words Dog and Cat: {getSimilarity('dog', 'cat')}")
print(f"Similarity between the words Ascending and Descending: {getSimilarity('ascending', 'descending')}")
print(f"Similarity between the words Victory and Defeat: {getSimilarity('victory', 'defeat')}")
print(f"Similarity between the words Less and More: {getSimilarity('less', 'more')}")
print(f"Similarity between the words True and False: {getSimilarity('true', 'false')}")

インタラクティブデモ 2.2.1: 単語間の類似度を確認する

# @markdown Type two words and run the cell!
word1 = 'King'  # @param \ {type:"string"}
word2 = 'Frog'  # @param \ {type:"string"}
word_similarity = getSimilarity(word1, word2)
print(f'Similarity between {word1} and {word2}: {word_similarity}')

埋め込みを使うことで、似た文脈で使われる単語を見つけることができます。しかし、単語に複数の意味がある場合はどうなるでしょうか？

同音異義語の類似度

同音異義語とは、綴りや発音は同じでも文脈によって意味が異なる単語のことです。これらの単語がどのように埋め込まれ、異なる文脈でどのような類似度を持つかを探ってみましょう。

#######################     Words with multiple meanings     ##########################
print(f"Similarity between the words Cricket and Insect: {getSimilarity('cricket', 'insect')}")
print(f"Similarity between the words Cricket and Sport: {getSimilarity('cricket', 'sport')}")

# @title Submit your feedback
content_review(f"{feedback_prefix}_Check_similarity_between_words_Interactive_Demo")

インタラクティブデモ 2.2.2: 同音異義語を探る

# @markdown Type the words and run the cell!
# @markdown examples - minute (time/small), pie (graph/food)

word = 'minute'  # @param \ {type:"string"}
context_word_1 = 'time'  # @param \ {type:"string"}
context_word_2 = 'small'  # @param \ {type:"string"}
word_similarity_1 = getSimilarity(word, context_word_1)
word_similarity_2 = getSimilarity(word, context_word_2)
print(f'Similarity between {word} and {context_word_1}: {word_similarity_1}')
print(f'Similarity between {word} and {context_word_2}: {word_similarity_2}')

単語の類推

埋め込み（Embeddings）は単語の類推を見つけるのに使えます。
やってみましょう：

男 : 女 :: 王 : _____
ドイツ : ベルリン :: フランス : _____
葉 : 木 :: 花びら : _____

## Use get_analogies() function.
# The words have to be in the order Positive, Negative, Positive

# Man : Woman  ::  King : _____
# Positive=(woman, king), Negative=(man)
print(ft_en_vectors.get_analogies("woman", "man", "king", 1))

# Germany: Berlin :: France : ______
# Positive=(berlin, france), Negative=(germany)
print(ft_en_vectors.get_analogies("berlin", "germany", "france", 1))

# Leaf : Tree  ::  Petal : _____
# Positive=(tree, petal), Negative=(leaf)
print(ft_en_vectors.get_analogies("tree", "leaf", "petal", 1))

しかし、いつも上手くいくのでしょうか？

貧困 : 富 :: 病気 : _____
電車 : 乗る :: 馬 : _____

# Poverty : Wealth  :: Sickness : _____
print(ft_en_vectors.get_analogies("wealth", "poverty", "sickness", 1))

# train : board :: horse : _____
print(ft_en_vectors.get_analogies("board", "train", "horse", 1))

# @title Submit your feedback
content_review(f"{feedback_prefix}_Explore_homonyms_Interactive_Demo")

セクション 2.3: 単語埋め込みを用いたニューラルネット

# @title Video 6: Using Embeddings
from ipywidgets import widgets
from IPython.display import YouTubeVideo
from IPython.display import IFrame
from IPython.display import display


class PlayVideo(IFrame):
  """IFrame that embeds a video hosted on Bilibili or OSF.

  Args:
    id: video identifier on the hosting platform; stored as `self.id`.
    source: 'Bilibili' or 'Osf'; selects the embed URL template.
    page: page number inserted into the Bilibili player URL.
    width, height: forwarded to `IFrame`.
    **kwargs: forwarded to `IFrame` unchanged.

  Raises:
    ValueError: if `source` is not one of the supported platforms.
  """
  def __init__(self, id, source, page=1, width=400, height=300, **kwargs):
    self.id = id
    if source == 'Bilibili':
      src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'
    elif source == 'Osf':
      src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'
    else:
      # Without this branch `src` stays unbound and the failure surfaces as
      # an UnboundLocalError on the line below.
      raise ValueError(f"Unsupported video source: {source!r}")
    super().__init__(src, width, height, **kwargs)


def display_videos(video_ids, W=400, H=300, fs=1):
  """Render every video into its own `widgets.Output` panel.

  Args:
    video_ids: sequence of entries whose first item is the source name
      ('Youtube', 'Bilibili' or 'Osf') and whose second item is the video id.
    W, H: player width and height.
    fs: forwarded to the player as the `fs` keyword.

  Returns:
    List of `widgets.Output`, one per entry, in input order.
  """
  panels = []
  for entry in video_ids:
    source, key = entry[0], entry[1]
    panel = widgets.Output()
    with panel:
      if source == 'Youtube':
        video = YouTubeVideo(id=key, width=W, height=H, fs=fs, rel=0)
        print(f'Video available at https://youtube.com/watch?v={video.id}')
      else:
        video = PlayVideo(id=key, source=source, width=W, height=H,
                          fs=fs, autoplay=False)
        if source == 'Bilibili':
          print(f'Video available at https://www.bilibili.com/video/{video.id}')
        elif source == 'Osf':
          print(f'Video available at https://osf.io/{video.id}')
      display(video)
    panels.append(panel)
  return panels


video_ids = [('Youtube', '9ujUgNoPeF0'), ('Bilibili', 'BV1cU4y1Q7Fh')]
tab_contents = display_videos(video_ids, W=854, H=480)
# One tab per video, titled with the name of its hosting platform.
tabs = widgets.Tab()
tabs.children = tab_contents
for i, entry in enumerate(video_ids):
  tabs.set_title(i, entry[0])
display(tabs)

# @title Submit your feedback
content_review(f"{feedback_prefix}_Using_Embeddings_Video")

文脈を考慮しない単語埋め込みの学習は比較的安価ですが、多くの人は事前学習済みの単語埋め込みを使います。文脈依存の単語埋め込みを扱った後で、埋め込みを「ファインチューニング」（タスクに合わせて調整）する方法を見ていきます。

ここでは、事前学習済みの FastText 埋め込みを使って AG News データセットでニューラルネットを訓練してみましょう。

データはニュース記事とそれに付随するトピックラベルからなり、多クラス（4クラス）分類タスクです。

コーディング演習 1: シンプルなフィードフォワードネット

線形層のみのバニラニューラルネットを定義してください。次に単語埋め込みの平均をとってレビュー全体の埋め込みを作ります。ニューラルネットはサイズ128の隠れ層を1つ持ちます。

class NeuralNet(nn.Module):
  """ A vanilla neural network. """
  def __init__(self, output_size, hidden_size, embedding_length, word_embeddings):
    """
    Constructs a vanilla Neural Network Instance.

    Args:
      output_size: Integer
        Specifies the size of output vector
      hidden_size: Integer
        Specifies the size of hidden layer
      embedding_length: Integer
        Specifies the size of the embedding vector
      word_embeddings: Tensor
        Specifies the weights to create embeddings from
        vocabulary.

    Returns:
      Nothing
    """
    super(NeuralNet, self).__init__()

    self.output_size = output_size
    self.hidden_size = hidden_size
    self.embedding_length = embedding_length

    self.word_embeddings = nn.EmbeddingBag.from_pretrained(word_embeddings)
    # Keep the pretrained embeddings fixed. The attribute is `requires_grad`;
    # a misspelt name would only attach an unused attribute to the parameter.
    self.word_embeddings.weight.requires_grad = False
    self.fc1 = nn.Linear(embedding_length, hidden_size)
    self.fc2 = nn.Linear(hidden_size, output_size)
    self.init_weights()

  def init_weights(self):
    """Initialise both linear layers: uniform weights in [-0.5, 0.5], zero biases."""
    initrange = 0.5
    self.fc1.weight.data.uniform_(-initrange, initrange)
    self.fc1.bias.data.zero_()
    self.fc2.weight.data.uniform_(-initrange, initrange)
    self.fc2.bias.data.zero_()

  def forward(self, inputs, offsets):
    """
    Compute the final labels by taking tokens as input.

    Args:
      inputs: Tensor
        Tensor of tokens in the text
      offsets: Tensor
        Passed to the `EmbeddingBag` layer together with `inputs`

    Returns:
      out: Tensor
        Final prediction Tensor
    """
    embedded = self.word_embeddings(inputs, offsets)  # convert text to embeddings
    #################################################
    # Implement a vanilla neural network
    raise NotImplementedError("Neural Net `forward`")
    #################################################
    # Pass the embeddings through the neural net
    # Use ReLU as the non-linearity
    x = ...
    x = ...
    x = ...
    output = F.log_softmax(x, dim=1)
    return output

解答を見る

# @title Submit your feedback
content_review(f"{feedback_prefix}_Simple_feed_forward_net_Exercise")

# @markdown ### Helper functions

# @markdown - `train(model, dataloader)`

# @markdown - `evaluate(model,  dataloader)`

# @markdown - `load_dataset(dataset_name, device, seed, batch_size, valid_split)`

# @markdown - `plot_train_val(x, train, val, train_label, val_label, title)`


# Training
import time

def train(model, dataloader):
  """Train `model` for one pass over `dataloader`.

  Relies on the module-level names `optimizer`, `criterion` and `epoch`
  (the latter only for the progress message), which must be defined before
  this function is called.

  Args:
    model: network called as `model(text, offsets)`.
    dataloader: iterable yielding `(label, text, offsets)` batches.

  Returns:
    Tuple of (accuracy over all samples seen, loss of the last batch).
    NOTE(review): the loss is the last batch's loss tensor, not an average
    over the epoch - confirm this is what callers expect.
  """
  model.train()
  total_acc, total_count = 0, 0
  log_interval = 500

  for idx, (label, text, offsets) in enumerate(dataloader):
    optimizer.zero_grad()
    predicted_label = model(text, offsets)
    loss = criterion(predicted_label, label)
    loss.backward()
    # Clip the gradient norm to 0.1 before the optimizer step.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
    optimizer.step()
    total_acc += (predicted_label.argmax(1) == label).sum().item()
    total_count += label.size(0)
    if idx % log_interval == 0 and idx > 0:
      print(f'| epoch {epoch:3d} | {idx:5d}/{len(dataloader):5d} batches '
            f'| accuracy {total_acc/total_count:8.3f}')

  return total_acc/total_count, loss


def evaluate(model, dataloader):
  """Evaluate `model` on `dataloader` without tracking gradients.

  Relies on the module-level `criterion`.

  Args:
    model: network called as `model(text, offsets)`.
    dataloader: iterable yielding `(label, text, offsets)` batches.

  Returns:
    Tuple of (accuracy over all samples seen, loss of the last batch).
    NOTE(review): the loss is the last batch's loss tensor, not an average
    over the dataset - confirm this is what callers expect.
  """
  model.eval()
  total_acc, total_count = 0, 0

  with torch.no_grad():
    for label, text, offsets in dataloader:
      predicted_label = model(text, offsets)
      loss = criterion(predicted_label, label)
      total_acc += (predicted_label.argmax(1) == label).sum().item()
      total_count += label.size(0)
  return total_acc/total_count, loss


def load_dataset(dataset_name, tokenizer, device='cpu', seed=0, batch_size=32, valid_split=0.7):
  """Load a dataset, tokenize it and wrap its splits in DataLoaders.

  Args:
    dataset_name: name passed to `datasets.load_dataset`.
    tokenizer: object providing `encode_batch`; each encoding's `ids` are used.
    device: device the collated tensors are moved to.
    seed: seed for the train/validation split.
    batch_size: batch size used for all three DataLoaders.
    valid_split: passed as `train_size` to `train_test_split`, i.e. the
      fraction of the original "train" split kept for training; the rest
      becomes the validation set.

  Returns:
    Tuple of (num_class, train_dataloader, valid_dataloader, test_dataloader).
  """
  def encode(samples):
    """Tokenize the "text" field of a batch and return the ids under "ids"."""
    enc = tokenizer.encode_batch(samples["text"])
    return {"ids": [torch.IntTensor(e.ids) for e in enc]}

  def collate_batch(batch):
      """Pack a batch into (labels, concatenated token ids, bag start offsets)."""
      label_list, text_list, offsets = [], [], [0]
      for sample in batch:
          label_list.append(sample["label"])
          processed_text = torch.tensor(sample["ids"], dtype=torch.int64)
          text_list.append(processed_text)
          offsets.append(processed_text.size(0))
      label_list = torch.tensor(label_list, dtype=torch.int64)
      # Drop the last length and take the running sum: start index of each
      # sample inside the concatenated `text_list`.
      offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
      text_list = torch.cat(text_list)
      return label_list.to(device), text_list.to(device), offsets.to(device)

  dataset = datasets.load_dataset(dataset_name)
  dataset = dataset.map(encode, batched=True)

  # Number of classes is read from the label feature of the "train" split.
  num_class = len(dataset["train"].features["label"].names)
  splits = dataset["train"].train_test_split(train_size=valid_split, seed=seed)
  train_dataloader = DataLoader(splits["train"], batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
  valid_dataloader = DataLoader(splits["test"], batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
  test_dataloader = DataLoader(dataset["test"], batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

  return num_class, train_dataloader, valid_dataloader, test_dataloader


# Plotting
def plot_train_val(x, train, val, train_label, val_label, title, ylabel):
  """Draw the training and validation curves on one figure and show it.

  Args:
    x: shared x-axis values (plotted against an axis labelled 'epoch').
    train: training-series values.
    val: validation-series values.
    train_label: legend entry for the training curve.
    val_label: legend entry for the validation curve.
    title: figure title.
    ylabel: y-axis label.
  """
  for series, legend_name in ((train, train_label), (val, val_label)):
    plt.plot(x, series, label=legend_name)
  plt.legend()
  plt.xlabel('epoch')
  plt.ylabel(ylabel)
  plt.title(title)
  plt.show()

# @markdown ### Download embeddings

# @markdown This will load 300 dim FastText embeddings.

# @markdown It will take around 3-4 minutes.

# embedding_fasttext = FastText('simple') # used only to load into model
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec"
fname = "fasttext.simple.300d"

print('Downloading Started...')
# Stream the response straight to disk in chunks instead of buffering the
# whole file in memory, and fail loudly on an HTTP error so an error page is
# never saved and later parsed as vectors.
with requests.get(url, stream=True, timeout=60) as r:
  r.raise_for_status()
  with open(fname, 'wb') as f:
    for chunk in r.iter_content(chunk_size=1 << 20):
      f.write(chunk)
print('Downloading Completed.')

# load into tensor
with open(fname, "rb") as f:
  lines = f.read().split(b"\n")

# The first line is the header: "<vocab_size> <dim>".
vocab_size, dim = lines[0].split(b" ")
# One extra row (left as zeros) is reserved for the "<unk>" token.
fasttext_vectors = torch.zeros(int(vocab_size)+1, int(dim))
fasttext_vocab = dict()
idx = 0
for line in lines[1:]:
  entries = line.rstrip().split(b" ")
  if len(entries) < 2:
    # Blank line (e.g. the empty piece after the final newline): skip it
    # rather than assuming the file always ends with a newline.
    continue
  word, entries = entries[0], entries[1:]
  fasttext_vectors[idx] = torch.tensor([float(x) for x in entries])
  fasttext_vocab[word.decode()] = idx
  idx += 1
# `idx` now equals the number of parsed rows, i.e. the index of the spare row.
fasttext_vocab["<unk>"] = idx
print("Vectors loaded.")

# Build the tokenizer from the FastText vocabulary, then the train/validation/
# test loaders and the classifier that starts from the FastText vectors.
tokenizer = get_tokenizer(fasttext_vocab)
num_class, train_data, valid_data, test_data = load_dataset(
  "fancyzhx/ag_news", tokenizer, device=DEVICE, seed=1, batch_size=32, valid_split=0.7
)
hidden_size = 128
embedding_length = fasttext_vectors.size(1)  # 300
model = NeuralNet(num_class, hidden_size, embedding_length, fasttext_vectors).to(DEVICE)

# Hyperparameters
EPOCHS = 10  # number of passes over the training loader
LR = 5  # initial learning rate handed to SGD
# NOTE(review): BATCH_SIZE is not referenced in this section; the loaders
# above were created with batch_size=32 — confirm which value is intended.
BATCH_SIZE = 64  # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# With step_size 1.0 and gamma 0.1, every scheduler.step() call scales the
# learning rate by 0.1.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
# Best validation accuracy seen so far; None until the first epoch completes.
total_accu = None

# Per-epoch histories used for the plots after training.
train_loss, val_loss = [], []
train_acc, val_acc = [], []

# Train for EPOCHS epochs (numbered from 1), validating after each one and
# recording accuracy and loss for the plots.
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    accu_train, loss_train = train(model, train_data)
    accu_val, loss_val = evaluate(model, valid_data)
    train_loss.append(loss_train)
    val_loss.append(loss_val)
    train_acc.append(accu_train)
    val_acc.append(accu_val)

    # Step the scheduler only when validation accuracy falls below the best
    # value seen so far; otherwise record the new best.
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

# Final check on the held-out test loader.
print('Checking the results of test dataset.')
accu_test, loss_test = evaluate(model, test_data)
print('test accuracy {:8.3f}'.format(accu_test))

# The training loop numbers epochs 1..EPOCHS, so plot against that same
# numbering instead of a 0-based axis that disagrees with the printed logs.
plot_train_val(np.arange(1, EPOCHS + 1), train_acc, val_acc,
               'training_accuracy', 'validation_accuracy',
               'Neural Net on AG_NEWS text classification', 'accuracy')
# The recorded losses are tensors; convert them to NumPy before plotting.
plot_train_val(np.arange(1, EPOCHS + 1), [x.detach().cpu().numpy() for x in train_loss],
               [x.detach().cpu().numpy() for x in val_loss],
               'training_loss', 'validation_loss',
               'Neural Net on AG_NEWS text classification', 'loss')

# Display names for the class indices returned by `predict`, in index order.
ag_news_label = dict(enumerate(("World", "Sports", "Business", "Sci/Tec")))

def predict(text):
  """Return the predicted class index for one raw text string.

  The text is tokenized with the module-level `tokenizer` and passed to the
  module-level `model` as a single-sample batch (one offset, 0).

  Args:
    text: the string to classify.

  Returns:
    The index of the highest-scoring class, as a Python int.
  """
  # Create the inputs on the model's own device so the call works wherever
  # the model currently lives, not only after it has been moved to the CPU.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device("cpu")
  with torch.no_grad():
    token_ids = torch.tensor(tokenizer.encode(text).ids, dtype=torch.int64, device=device)
    offsets = torch.tensor([0], device=device)
    output = model(token_ids, offsets)
    return output.argmax(1).item()

# Sample article used to demonstrate `predict`. The backslash continuations
# are inside the string literal, so the leading spaces of each continued line
# are part of the text.
ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

# Run the demo prediction on the CPU.
model = model.to("cpu")

# Map the predicted class index to its display name.
print(f"This is a {ag_news_label[predict(ex_text_str)]} news")

まとめ

このチュートリアルでは、言語を例に時系列データの処理方法を紹介しました。時系列を処理するには、それらを埋め込みに変換する必要があります。まずテキストの単語をトークン化し、文脈を考慮しない埋め込みまたは文脈依存の埋め込みを作成します。最後に、これらの単語埋め込みをテキスト分類などの応用にどう活用できるかを見ました。

多言語埋め込みについて学びたい場合は、Colab または Kaggle のボーナスチュートリアルをご覧ください。ただし、その前にチュートリアル2を完了しておくことをお勧めします！