From d5ed1a61e5ec6a9148464ae112e058c1c3ef4882 Mon Sep 17 00:00:00 2001 From: wangshaonan Date: Mon, 8 Apr 2024 22:31:08 -0400 Subject: [PATCH 1/2] Delete tutorials/W3D1_TimeSeriesAndNaturalLanguageProcessing/W3D1_Tutorial2.ipynb --- .../W3D1_Tutorial2.ipynb | 1879 ----------------- 1 file changed, 1879 deletions(-) delete mode 100644 tutorials/W3D1_TimeSeriesAndNaturalLanguageProcessing/W3D1_Tutorial2.ipynb diff --git a/tutorials/W3D1_TimeSeriesAndNaturalLanguageProcessing/W3D1_Tutorial2.ipynb b/tutorials/W3D1_TimeSeriesAndNaturalLanguageProcessing/W3D1_Tutorial2.ipynb deleted file mode 100644 index fd688a6c7..000000000 --- a/tutorials/W3D1_TimeSeriesAndNaturalLanguageProcessing/W3D1_Tutorial2.ipynb +++ /dev/null @@ -1,1879 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "execution": {}, - "id": "view-in-github" - }, - "source": [ - "\"Open   \"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "# Tutorial 2: Natural Language Processing and LLMs\n", - "\n", - "**Week 3, Day 1: Time Series and Natural Language Processing**\n", - "\n", - "**By Neuromatch Academy**\n", - "\n", - "__Content creators:__ Lyle Ungar, Jordan Matelsky, Konrad Kording, Shaonan Wang, Alish Dipani\n", - "\n", - "__Content reviewers:__ Shaonan Wang, Weizhe Yuan, Dalia Nasr, Stephen Kiilu, Alish Dipani, Dora Zhiyu Yang, Adrita Das\n", - "\n", - "__Content editors:__ Konrad Kording, Shaonan Wang\n", - "\n", - "__Production editors:__ Konrad Kording, Spiros Chavlis" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# Tutorial Objectives\n", - "\n", - "This tutorial provides a comprehensive overview of modern natural language processing (NLP). It introduces two influential NLP architectures, BERT and GPT, along with a detailed exploration of the underlying NLP pipeline. 
Participants will learn about the core concepts, functionalities, and applications of these architectures, as well as gain insights into prompt engineering and the current and future developments of GPT." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Tutorial slides\n", - "from IPython.display import IFrame\n", - "link_id = \"spuj8\"\n", - "print(f\"If you want to download the slides: https://osf.io/download/{link_id}/\")\n", - "IFrame(src=f\"https://mfr.ca-1.osf.io/render?url=https://osf.io/{link_id}/?direct%26mode=render%26action=download%26mode=render\", width=854, height=480)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Install dependencies\n", - "# @markdown **WARNING**: There may be *errors* and/or *warnings* reported during the installation. 
However, they are to be ignored.\n", - "!pip3 install gensim==4.3.1 --quiet\n", - "!pip3 install pytorch_lightning --quiet\n", - "!pip3 install typing_extensions --quiet\n", - "!pip install accelerate --quiet\n", - "!pip3 install datasets --quiet\n", - "!pip3 install transformers==4.28.0 --quiet\n", - "!pip3 install evaluate --quiet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Install and import feedback gadget\n", - "\n", - "!pip3 install vibecheck datatops --quiet\n", - "\n", - "from vibecheck import DatatopsContentReviewContainer\n", - "def content_review(notebook_section: str):\n", - " return DatatopsContentReviewContainer(\n", - " \"\", # No text prompt\n", - " notebook_section,\n", - " {\n", - " \"url\": \"https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab\",\n", - " \"name\": \"neuromatch_dl\",\n", - " \"user_key\": \"f379rz8y\",\n", - " },\n", - " ).render()\n", - "\n", - "\n", - "feedback_prefix = \"W3D1_T2\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "both", - "execution": {} - }, - "outputs": [], - "source": [ - "# Imports\n", - "import random\n", - "import numpy as np\n", - "from typing import Iterable, List\n", - "from tqdm.notebook import tqdm\n", - "from typing import Dict\n", - "import pytorch_lightning as pl\n", - "\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "from torch.utils.data import DataLoader, Dataset\n", - "from tokenizers import Tokenizer, Regex, models, normalizers, pre_tokenizers, trainers, processors" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Set random seed\n", - "\n", - "# @markdown Executing `set_seed(seed=seed)` you are setting the seed\n", - "\n", - "# for DL its critical to set 
the random seed so that students can have a\n", - "# baseline to compare their results to expected results.\n", - "# Read more here: https://pytorch.org/docs/stable/notes/randomness.html\n", - "\n", - "# Call `set_seed` function in the exercises to ensure reproducibility.\n", - "import random\n", - "import numpy as np\n", - "\n", - "def set_seed(seed=None):\n", - " if seed is None:\n", - " seed = np.random.choice(2 ** 32)\n", - " random.seed(seed)\n", - " np.random.seed(seed)\n", - " print(f'Random seed {seed} has been set.')\n", - "\n", - "\n", - "set_seed(seed=2023) # change 2023 with any number you like" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Set device (GPU or CPU). Execute `set_device()`\n", - "\n", - "# Inform the user if the notebook uses GPU or CPU.\n", - "\n", - "def set_device():\n", - " \"\"\"\n", - " Set the device. CUDA if available, CPU otherwise\n", - "\n", - " Args:\n", - " None\n", - "\n", - " Returns:\n", - " Nothing\n", - " \"\"\"\n", - " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - " if device != \"cuda\":\n", - " print(\"WARNING: For this notebook to perform best, \"\n", - " \"if possible, in the menu under `Runtime` -> \"\n", - " \"`Change runtime type.` select `GPU` \")\n", - " else:\n", - " print(\"GPU is enabled in this notebook.\")\n", - "\n", - " return device" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "DEVICE = set_device()\n", - "SEED = 2021\n", - "set_seed(seed=SEED)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "\n", - "# Section 1: NLP architectures\n", - "\n", - "From RNN/LSTM to Transformers." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Video 1: Intro to NLPs and LLMs\n", - "from ipywidgets import widgets\n", - "from IPython.display import YouTubeVideo\n", - "from IPython.display import IFrame\n", - "from IPython.display import display\n", - "\n", - "\n", - "class PlayVideo(IFrame):\n", - " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", - " self.id = id\n", - " if source == 'Bilibili':\n", - " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", - " elif source == 'Osf':\n", - " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", - " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", - "\n", - "\n", - "def display_videos(video_ids, W=400, H=300, fs=1):\n", - " tab_contents = []\n", - " for i, video_id in enumerate(video_ids):\n", - " out = widgets.Output()\n", - " with out:\n", - " if video_ids[i][0] == 'Youtube':\n", - " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", - " height=H, fs=fs, rel=0)\n", - " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", - " else:\n", - " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", - " height=H, fs=fs, autoplay=False)\n", - " if video_ids[i][0] == 'Bilibili':\n", - " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", - " elif video_ids[i][0] == 'Osf':\n", - " print(f'Video available at https://osf.io/{video.id}')\n", - " display(video)\n", - " tab_contents.append(out)\n", - " return tab_contents\n", - "\n", - "\n", - "video_ids = [('Youtube', 'PCz527-WbxY'), ('Bilibili', 'BV15V4y1a7Xu')]\n", - "tab_contents = display_videos(video_ids, W=854, H=480)\n", - "tabs = widgets.Tab()\n", - "tabs.children = tab_contents\n", - "for i in range(len(tab_contents)):\n", - " tabs.set_title(i, video_ids[i][0])\n", - 
"display(tabs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "A core principle of Natural Language Processing is embedding words as vectors. In the relevant vector space, words with similar meanings are close to one another.\n", - "\n", - "In classical transformer systems, a core principle is encoding and decoding. We can encode an input sequence as a vector (that implicitly codes what we just read). And we can then take this vector and decode it, e.g., as a new sentence. So a sequence-to-sequence (e.g., sentence translation) system may read a sentence (made out of words embedded in a relevant space) and encode it as an overall vector. It then takes the resulting encoding of the sentence and decodes it into a translated sentence.\n", - "\n", - "In modern transformer systems, such as GPT, all words are used parallelly. In that sense, the transformers generalize the encoding/decoding idea. Examples of this strategy include all the modern large language models (such as GPT)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_Intro_to_NLPs_and_LLMs_Video\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# Section 2: The NLP pipeline\n", - "\n", - "Tokenize, pretrain, fine-tune" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Video 2: NLP pipeline\n", - "from ipywidgets import widgets\n", - "from IPython.display import YouTubeVideo\n", - "from IPython.display import IFrame\n", - "from IPython.display import display\n", - "\n", - "\n", - "class PlayVideo(IFrame):\n", - " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", - " self.id = id\n", - " if source == 'Bilibili':\n", - " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", - " elif source == 'Osf':\n", - " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", - " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", - "\n", - "\n", - "def display_videos(video_ids, W=400, H=300, fs=1):\n", - " tab_contents = []\n", - " for i, video_id in enumerate(video_ids):\n", - " out = widgets.Output()\n", - " with out:\n", - " if video_ids[i][0] == 'Youtube':\n", - " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", - " height=H, fs=fs, rel=0)\n", - " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", - " else:\n", - " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", - " height=H, fs=fs, autoplay=False)\n", - " if video_ids[i][0] == 'Bilibili':\n", - " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", - " elif video_ids[i][0] == 'Osf':\n", - " print(f'Video 
available at https://osf.io/{video.id}')\n", - " display(video)\n", - " tab_contents.append(out)\n", - " return tab_contents\n", - "\n", - "\n", - "video_ids = [('Youtube', 'uPnTVbc4qUE'), ('Bilibili', 'BV1TM4y1E7ab')]\n", - "tab_contents = display_videos(video_ids, W=854, H=480)\n", - "tabs = widgets.Tab()\n", - "tabs.children = tab_contents\n", - "for i in range(len(tab_contents)):\n", - " tabs.set_title(i, video_ids[i][0])\n", - "display(tabs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_NLP_pipeline_Video\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "## Tokenizers\n", - "\n", - "Today we will practise embedding techniques, and continue our march toward large language models and transformers by discussing one of the critical developments of the modern NLP stack: **Tokenization.** Tokenizers convert inputs as a set of discrete tokens.\n", - "\n", - "### Learning Goals\n", - "\n", - "* Understand the concept of tokenization and why it is useful.\n", - "* Learn how to write a tokenizer from scratch, taking advantage of context.\n", - "* Get an intuition for how modern tokenizers work by playing with a few pre-trained tokenizers from industry." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "## Generating a dataset\n", - "\n", - "As we continue to move closer to \"production-grade\" NLP, we'll start to use industry standards such as the [HuggingFace](https://huggingface.co/) library. Huggingface is a large company that facilitates the exchange of aspects of modern deep learning systems.\n", - "\n", - "We'll start by generating a training dataset. 
`hf` has a convenient `datasets` module that allows us to download a variety of datasets, including the [Wikipedia text corpus](https://huggingface.co/datasets/wiki_text). We'll use this to generate a dataset of text from Wikipedia." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "\n", - "dataset = load_dataset(\"wikitext\", \"wikitext-103-raw-v1\", split=\"train\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "print(dataset[41492])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "def generate_n_examples(dataset, n=512):\n", - " \"\"\"\n", - " Produce a generator that yields n examples at a time from the dataset.\n", - " \"\"\"\n", - " for i in range(0, len(dataset), n):\n", - " yield dataset[i:i + n]['text']" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Now we will create the actual `Tokenizer`, adhering to the [`hf.Tokenizer` protocol](https://huggingface.co/docs/transformers/main_classes/tokenizer). (Adhering to a standard protocol enables us to swap in our tokenizer for any tokenizer in the huggingface ecosystem or to apply our own tokenizer to any model in the huggingface ecosystem.)\n", - "\n", - "Let's sketch out the steps of writing a Tokenizer. We need to solve two problems:\n", - "\n", - "* Given a string, split it into a list of tokens.\n", - "* If you don't recognize a word, still figure out a way to tokenize it!\n", - "\n", - "This may feel like we're reinventing our one-hot encoder with a richer vocabulary. 
Why is it that the One-Hot-Encoder, which outputs a vector of length $|V|$, where $|V|$ is the size of our vocabulary, is not sufficient, but a tokenizer that outputs a list of indices into a vocabulary of size $|V|$ is sufficient? The answer is that while our encoder was responsible for embedding words into a high-dimensional space, our tokenizer is NOT; the \"win\" of a tokenizer is that it breaks up a string into in-vocab elements. For certain workflows, the very next step might be adding an embedder onto the end of the tokenizer. (As we'll soon see, this is exactly the strategy employed by modern Transformer models.)\n", - "\n", - "Tokens will almost always be different from words; for example, we might want to split \"don't\" into \"do\" and \"n't\", or we might want to split \"don't\" into \"do\" and \"not\". Or we might even want to split \"don't\" into \"d\", \"o\", \"n\", and \"t\". We can choose any strategy we want here; **, unlike Word2Vec, our tokenizer will NOT be limited to outputting one vector per English word.** Here, we'll use an off-the-shelf subword splitter, which we discuss below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "VOCAB_SIZE = 12_000" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# Create a tokenizer object that uses the \"WordPiece\" model. The WorkPiece model\n", - "# is a subword tokenizer that uses a vocabulary of common words and word pieces\n", - "# to tokenize text. The \"unk_token\" parameter specifies the token to use for\n", - "# unknown tokens, i.e. tokens that are not in the vocabulary. 
(Remember that the\n", - "# vocabulary will be built from our dataset, so it will include subchunks of\n", - "# English words.)\n", - "tokenizer = Tokenizer(models.WordPiece(unk_token=\"[UNK]\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "## Tokenizer Features\n", - "\n", - "Now let's start dressing up our tokenizer with some useful features. First, let's clean up the text. This process is formally called \"normalization\" and is a critical step in any NLP pipeline. We'll remove punctuation and then convert all the text to lowercase. We'll also remove diacritics (accents) from the text." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# Think of a Normalizer Sequence the same way you would think of a PyTorch\n", - "# Sequential model. It is a sequence of normalizers that are applied to the\n", - "# text before tokenization, in the order that they are added to the sequence.\n", - "\n", - "tokenizer.normalizer = normalizers.Sequence([\n", - " normalizers.Replace(Regex(r\"[\\s]\"), \" \"), # Convert all whitespace to single space\n", - " normalizers.Lowercase(), # Convert all text to lowercase\n", - " normalizers.NFD(), # Decompose all characters into their base characters\n", - " normalizers.StripAccents(), # Remove all accents\n", - "])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Next, we'll add a pre-tokenizer. The pre-tokenizer is applied to the text after normalizing it but before it's tokenized. The pre-tokenizer is useful for splitting text into chunks, which are easier to tokenize. For example, we can split text into chunks separated by punctuation or whitespace." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "tokenizer.pre_tokenizer = pre_tokenizers.Sequence([\n", - " pre_tokenizers.WhitespaceSplit(), # Split on whitespace\n", - " pre_tokenizers.Digits(individual_digits=True), # Split digits into individual tokens\n", - " pre_tokenizers.Punctuation(), # Split punctuation into individual tokens\n", - "])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note:** In practice, it is not necessary to use pre-tokenizers, but we use it for demonstration purposes. For instance, \"2-3\" is not the same as \"23\", so removing punctuation or splitting up digits or punctuation is a bad idea! Moreover, the current tokenizer is powerful enough to deal with punctuation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Finally, we'll train the tokenizer with our dataset. After all, we want a tokenizer that works well on this dataset. There are a few different algorithms for training tokenizers. Here are two common ones:\n", - "\n", - "* BPE Algorithm: Start with a vocabulary of each character in the dataset. Examine all pairs from the vocabulary and merge the pair with the highest frequency in the dataset. Repeat until the vocabulary size is reached (so \"ee\" is more likely to get merged than \"zf\" in the English corpus).\n", - "* Top-Down WordPiece Algorithm: Generate all substrings of each word from the dataset and count occurrences in the training data. Keep any string that occurs more than a threshold number of times. Repeat this process until the vocabulary size is reached (For a more thorough explanation of this process, see [the TensorFlow Guide](https://www.tensorflow.org/text/guide/subwords_tokenizer#optional_the_algorithm))\n", - "\n", - "We'll use WordPiece in the next cell." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "tokenizer_trainer = trainers.WordPieceTrainer(\n", - " vocab_size=VOCAB_SIZE,\n", - " # We have to specify the special tokens that we want to use. These will be\n", - " # added to the vocabulary no matter what the vocab-building algorithm does.\n", - " special_tokens=[\"[PAD]\", \"[UNK]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"],\n", - " show_progress=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Special Tokens\n", - "\n", - "Tokenizers often have special tokens representing certain concepts such as:\n", - "* [PAD]: Added to the end of shorter input sequences to ensure equal input length for the whole batch\n", - "* [START]: Start of the sequence\n", - "* [END]: End of the sequence\n", - "* [UNK]: Unknown characters not present in the vocabulary\n", - "* [BOS]: Beginning of sentence\n", - "* [EOS]: End of sentence\n", - "* [SEP]: Separation between two sentences in a sequence\n", - "* [CLS]: Token used for classification tasks to represent the whole sequence\n", - "* [MASK]: Used in pre-training phase for masked language modeling tasks in models like BERT" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Those special tokens are important because it tells the WordPiece training process how to treat phrases, masks, and unknown tokens.\n", - "\n", - "**Note:** We can also add our own special tokens, such as `[CITE]`, to indicate when a citation is about to be used if we want to train a model to predict the presence of citations in a text. Training this will take a bit of time." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "sample_ratio = 0.2\n", - "keep = int(len(dataset)*sample_ratio)\n", - "dataset_small = load_dataset(\"wikitext\", \"wikitext-103-raw-v1\", split=f\"train[:{keep}]\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "tokenizer.train_from_iterator(generate_n_examples(dataset_small), trainer=tokenizer_trainer, length=len(dataset_small))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# In \"real life\", we'd probably want to save the tokenizer to disk so that we\n", - "# can use it later. We can do this with the \"save\" method:\n", - "# tokenizer.save(\"tokenizer.json\")\n", - "\n", - "# Let's try it out!\n", - "print(\"Hello, world!\")\n", - "print(\n", - " *zip(\n", - " tokenizer.encode(\"Hello, world!\").tokens,\n", - " tokenizer.encode(\"Hello, world!\").ids,\n", - " )\n", - ")\n", - "\n", - "\n", - "# Can we also tokenize made-up words?\n", - "print(tokenizer.encode(\"These toastersocks are so groommpy!\").tokens)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "(The `##` means that the token is a continuation of the previous chunk.)\n", - "\n", - "Try playing around with the hyperparameters and the tokenizing algorithms to see how they affect the tokenizer's output. There can be some very major differences!\n", - "\n", - "In summary, we created a tokenizer pipeline that:\n", - "\n", - "* Normalizes the text (cleans up punctuation and diacritics)\n", - "* Splits the text into chunks (using whitespace and punctuation)\n", - "* Trains the tokenizer on the dataset (using the WordPiece algorithm)\n", - "\n", - "In common use, this would be the first step of any modern NLP pipeline. 
The next step would be to add an embedder to the end of the tokenizer, so that we can feed in a high-dimensional space to our model. But unlike Word2Vec, we can now separate the tokenization step from the embedding step, which means our encoding/embedding process can be task-specific, custom to our downstream neural net architecture, instead of general-purpose." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "### Think 2.1! Tokenizer good practices\n", - "\n", - "We established that the tokenizer is a better move than the One-Hot-Encoder because it can handle out-of-vocabulary words. But what if we just made a one-hot encoding where the vocabulary is all possible two-character combinations? Would there still be an advantage to the tokenizer?\n", - "\n", - "**Hint:** Re-read the section on the BPE and WordPiece algorithms, and how the tokens are selected." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# to_remove explanation\n", - "\n", - "\"\"\"\n", - "If we used a one-hot encoding where the vocabulary is all possible two-character\n", - "combinations, we would still face some problems that the tokenizer can solve.\n", - "Here are some of them:\n", - "\n", - "* The vocabulary size would be very large, since there are 26^2 = 676 possible\n", - "two-character combinations in English. This would make the one-hot vectors\n", - "very sparse and high-dimensional, which can affect the efficiency and\n", - "performance of the model.\n", - "* The one-hot encoding would not capture any semantic or syntactic information\n", - "about the words, since each two-character combination would be treated as an\n", - "independent unit. 
This would make it harder for the model to learn meaningful\n", - "representations of the words and their contexts.\n", - "* The one-hot encoding would not handle rare or unseen words well, since\n", - "it would either ignore them or assign them to a generic unknown token.\n", - "This would limit the generalization ability of the model and reduce its\n", - "accuracy on new data.\n", - "\n", - "\n", - "The tokenizer, on the other hand, can overcome these problems by using subword\n", - "units that are based on the frequency and co-occurrence of characters\n", - "in the corpus. The tokenizer can:\n", - "\n", - "* Reduce the vocabulary size by merging frequent and meaningful subword units\n", - "into larger tokens. For example, instead of having separate tokens\n", - "for “in”, “ing”, “tion”, etc., the tokenizer can merge them into a single token\n", - "that represents a common suffix.\n", - "* Capture some semantic and syntactic information about the words, since the\n", - "subword units are derived from the data and reflect how words are composed and\n", - "used. For example, the tokenizer can split a word like “unhappy” into “un” and\n", - "“happy”, which preserves some information about its meaning and structure.\n", - "* Handle rare or unseen words better, since it can split them into smaller\n", - "subword units that are likely to be in the vocabulary. For example, if the word\n", - "“neural” is not in the vocabulary, the tokenizer can split it into “neu” and\n", - "“ral”, which are more likely to be seen in other words.\n", - "\n", - "Therefore, there is still an advantage to using the tokenizer over the\n", - "one-hot encoding, even if we use all possible two-character combinations\n", - "as the vocabulary. 
The tokenizer can create more compact, informative, and\n", - "flexible representations of words that can improve the performance of the model.\n", - "\"\"\";" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_Tokenizer_good_practices_Discussion\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "### Think 2.2: Chinese and English tokenizer\n", - "\n", - "Let's think about a language like Chinese, where words are each composed of a relatively fewer number of characters compared to English (`hungry` is six unicode characters, but `饿` is one unicode character), but there are many more unique Chinese characters than there are letters in the English alphabet.\n", - "\n", - "In a one or two sentence high-level sketch, what properties would be desireable for a Chinese tokenizer to have?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# to_remove explanation\n", - "\n", - "\"\"\"\n", - "For instance, it should be able to segment words based on the meaning and usage\n", - "of the characters, rather than relying on spaces or punctuation.\n", - "For example, it should recognize that “北京” is a single word meaning “Beijing”,\n", - "rather than two separate characters.\n", - "\"\"\";" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_Chinese_and_English_tokenizer_Discussion\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# Section 3: Using BERT\n", - "\n", - "In this section, we will learn about using the BERT model from huggingface.\n", - "\n", - "## Learning Goals\n", - "* Understand the idea behind BERT\n", - "* Understand the idea of pre-training and fine-tuning\n", - "* Understand how freezing parts of the network is useful" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Video 3: BERT\n", - "from ipywidgets import widgets\n", - "from IPython.display import YouTubeVideo\n", - "from IPython.display import IFrame\n", - "from IPython.display import display\n", - "\n", - "\n", - "class PlayVideo(IFrame):\n", - " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", - " self.id = id\n", - " if source == 'Bilibili':\n", - " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", - " elif source == 'Osf':\n", - " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", - " super(PlayVideo, self).__init__(src, width, height, 
**kwargs)\n", - "\n", - "\n", - "def display_videos(video_ids, W=400, H=300, fs=1):\n", - " tab_contents = []\n", - " for i, video_id in enumerate(video_ids):\n", - " out = widgets.Output()\n", - " with out:\n", - " if video_ids[i][0] == 'Youtube':\n", - " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", - " height=H, fs=fs, rel=0)\n", - " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", - " else:\n", - " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", - " height=H, fs=fs, autoplay=False)\n", - " if video_ids[i][0] == 'Bilibili':\n", - " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", - " elif video_ids[i][0] == 'Osf':\n", - " print(f'Video available at https://osf.io/{video.id}')\n", - " display(video)\n", - " tab_contents.append(out)\n", - " return tab_contents\n", - "\n", - "\n", - "video_ids = [('Youtube', 'u4D-84Z1Fxs'), ('Bilibili', 'BV17u411b7gJ')]\n", - "tab_contents = display_videos(video_ids, W=854, H=480)\n", - "tabs = widgets.Tab()\n", - "tabs.children = tab_contents\n", - "for i in range(len(tab_contents)):\n", - " tabs.set_title(i, video_ids[i][0])\n", - "display(tabs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_BERT_Video\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "# Section 4: NLG with GPT\n", - "\n", - "In this section we will learn about Natural Language Generation with Generative Pretrained Transformers.\n", - "\n", - "## Learning goals\n", - "* How to produce language with GPTs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Video 4: NLG\n", - "from ipywidgets import widgets\n", - "from IPython.display 
import YouTubeVideo\n", - "from IPython.display import IFrame\n", - "from IPython.display import display\n", - "\n", - "\n", - "class PlayVideo(IFrame):\n", - " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", - " self.id = id\n", - " if source == 'Bilibili':\n", - " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", - " elif source == 'Osf':\n", - " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", - " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", - "\n", - "\n", - "def display_videos(video_ids, W=400, H=300, fs=1):\n", - " tab_contents = []\n", - " for i, video_id in enumerate(video_ids):\n", - " out = widgets.Output()\n", - " with out:\n", - " if video_ids[i][0] == 'Youtube':\n", - " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", - " height=H, fs=fs, rel=0)\n", - " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", - " else:\n", - " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", - " height=H, fs=fs, autoplay=False)\n", - " if video_ids[i][0] == 'Bilibili':\n", - " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", - " elif video_ids[i][0] == 'Osf':\n", - " print(f'Video available at https://osf.io/{video.id}')\n", - " display(video)\n", - " tab_contents.append(out)\n", - " return tab_contents\n", - "\n", - "\n", - "video_ids = [('Youtube', 'vwFMHitq-FY'), ('Bilibili', 'BV1Hu411b7dx')]\n", - "tab_contents = display_videos(video_ids, W=854, H=480)\n", - "tabs = widgets.Tab()\n", - "tabs.children = tab_contents\n", - "for i in range(len(tab_contents)):\n", - " tabs.set_title(i, video_ids[i][0])\n", - "display(tabs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_NLG_Video\")" - ] - }, - 
{ - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "## Using state-of-the-art (SOTA) Models\n", - "\n", - "Unless you are writing your own experimental DL research (and sometimes even then!) it is _far_ more common these days to use the HuggingFace model library to import and start working with state-of-the-art models quickly. In this section, we will show you how to do that.\n", - "\n", - "We will download a pretrained model from the hf `transformers` library that is used to generate text. We will then fine-tune it on a different dataset, using the `hf.datasets` library and the HuggingFace Trainer classes to make the process as easy as possible, and we'll see that we can accomplish all of this in just a few lines of easily maintained code.\n", - "\n", - "Ultimately, we will have a _working_ generator... for code!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "We're first going to pick a tokenizer. You can see some of the options [here](https://huggingface.co/transformers/pretrained_models.html). We'll use CodeParrot tokenizer, which is a BPE tokenizer. But you can choose (or build!) another if you'd like to try offroading!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "from transformers import AutoTokenizer\n", - "from datasets import load_dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "tokenizer = AutoTokenizer.from_pretrained(\"codeparrot/codeparrot-small\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "### Think 4.1! Tokenizers\n", - "\n", - "Why can you use a different tokenizer than the one that was originally used? What requirements must another tokenizer for this task have?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# to_remove explanation\n", - "\n", - "\"\"\"\n", - "You couldn't, for example, use the very popular `bert-base-uncased` tokenizer,\n", - "even though it's a popular choice for text generation tasks that were trained\n", - "on the English Wikipedia and the BookCorpus datasets (which are both available\n", - "in the `hf.datasets` library).\n", - "\"\"\";" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_Tokenizers_Discussion\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Next, we'll download a pre-built model architecture. CodeParrot (the model) is a GPT-2 model, which is a transformer-based language model. You can see some of the options [here](https://huggingface.co/transformers/pretrained_models.html). But you can choose (or build!) another!\n", - "\n", - "Note that `codeparrot/codeparrot` (https://huggingface.co/codeparrot/codeparrot) is about 7GB to download (so it may take a while, or it may be too large for your runtime if you're on a free Colab). Instead, we will use a smaller model, `codeparrot/codeparrot-small` (https://huggingface.co/codeparrot/codeparrot-small), which is only ~500MB.\n", - "\n", - "To run everything together — tokenization, model, and de-tokenization, we can use the `pipeline` function from `transformers`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "from transformers import AutoModelWithLMHead\n", - "from transformers import pipeline\n", - "\n", - "model = AutoModelWithLMHead.from_pretrained(\"codeparrot/codeparrot-small\")\n", - "generation_pipeline = pipeline(\n", - " \"text-generation\", # The task to run. This tells hf what the pipeline steps are\n", - " model=model, # The model to use; can also pass the string here;\n", - " tokenizer=tokenizer, # The tokenizer to use; can also pass the string name here.\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "input_prompt = '''\\\n", - "def simple_add(a: int, b: int) -> int:\n", - " \"\"\"\n", - " Adds two numbers together and returns the result.\n", - " \"\"\"'''\n", - "\n", - "# Return tensors for PyTorch:\n", - "inputs = tokenizer(input_prompt, return_tensors=\"pt\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Recall that these tokens are integer indices in the vocabulary of the tokenizer. We can use the tokenizer to decode these tokens into a string, which we can print out to see what the model generates." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "input_token_ids = inputs[\"input_ids\"]\n", - "input_strs = tokenizer.convert_ids_to_tokens(*input_token_ids.tolist())\n", - "\n", - "print(*zip(input_strs, input_token_ids[0]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "**(Quick knowledge-check: what are the weirdly-rendering characters representing?)**\n", - "\n", - "This model is already ready to use! Let's give it a try. 
(Note that we don't use `inputs` — we just generated that to show the initial tokenization steps.)\n", - "\n", - "Here, we use the `pipeline` we created earlier to combine all our components. If you were writing a Copilot-style code-completer, you could get away with wrapping this single line in a nice API and calling it a day!\n", - "\n", - "Play with the hyperparameters and see what kinds of outputs you can get. Temperature measures how much randomness is added to the model's predictions. Higher temperature means more randomness and lower temperature means less randomness. More randomness in the latent space will lead to wilder predictions and potentially more creative answers. A good place to start is `0.2`. You can also try changing the `max_length` parameter, which controls how long the generated code can be (though the model can opt to put a \"stop\" token in the middle of the sequence, so it may not always generate exactly this many tokens)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "outputs = generation_pipeline(input_prompt, max_length=100, num_return_sequences=1, temperature=0.2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "print(outputs[0][\"generated_text\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Let's see if we can fool our model now! The huggingface documentation tells us that the codeparrot model was trained to generate Python code ([docs](https://huggingface.co/codeparrot/codeparrot-small)). Let's see if we can get it to generate some JavaScript." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "input_prompt = \"class SimpleAdder {\"\n", - "\n", - "print(generation_pipeline(input_prompt, max_length=100, num_return_sequences=1, temperature=0.2)[0][\"generated_text\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Yikes! I don't know what it generated for you, but what it made for me was:\n", - "\n", - "```python\n", - "class SimpleAdder {\n", - " public:\n", - " class SimpleAdder(object):\n", - " def __init__(self, a, b):\n", - " self.a = a\n", - " self.b = b\n", - "\n", - " def __call__(self, x):\n", - " return self.a + x\n", - "```\n", - "\n", - "**Ew!** That's wrong in a _lot_ of ways. But it's understandable: Our model can't really generalize outside of the domain in which it was trained. And so probably there were a few Python files that included syntax of other languages (perhaps generators for other code?). So the model knows that there's some mysterious syntax that uses curly brackets... But it's not sure about anything else. (For the programming-language hobbyists among you: The `public` notation looks to me a lot like the model is trying to do something C-flavored and perhaps something Java-flavored; I like it! But it's definitely not JavaScript.)\n", - "\n", - "What are the major observations?\n", - "\n", - "* The syntax it's generating rapidly devolves into Python; it can predict only a few characters of non-Python before falling back into its familiar training territory.\n", - "* The part of the code that follows Python syntax is valid and resembles a useful class definition (although if you look closely, it doesn't seem to do anything useful with the `b` attribute...). This tells us that the model \"understands\" its problem domain but hasn't been trained on the correct data to solve our new problem."
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "### Think 4.2! Using SOTA models\n", - "\n", - "What are your other observations about the code it generated for you? You're now aware of how Transformers work.\n", - "\n", - "1. Think specifically and remark about the observations a machine learning practitioner would make here if your role were to diagnose the error in a production system.\n", - "2. Now, how would a nonexpert user interpret the issues?\n", - "3. Do you think the model-reported confidence for this output would be high, low, or in between...?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# to_remove explanation\n", - "\n", - "\"\"\"\n", - "Here is one possible answer.\n", - "1. The model is not well-trained or fine-tuned on the task of generating Python\n", - "code from natural language instructions. It may have insufficient data,\n", - "low quality data, or inappropriate hyperparameters.\n", - "2. The model is not smart or reliable enough to write code for them.\n", - "It may have bugs, glitches, or limitations that prevent it from working properly.\n", - "3. I think the model-reported confidence for this output would be low, since\n", - "the output has many errors and deviations from the instructions. However, the\n", - "confidence may also depend on how the model is trained and calibrated, and how\n", - "it estimates its own uncertainty and quality.\n", - "\"\"\";" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_Using_SOTA_models_Discussion\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "## Fine-Tuning\n", - "\n", - "Alright, so we have a model that can generate code. 
But now, we want to fine-tune it to generate JavaScript.\n", - "\n", - "Assuming the data will be too large to fit on disk on Colab, we'll use the `load_dataset` function to download only part of the dataset. There's a JavaScript subset to the codeparrot dataset, which we'll use as an example… But you can use any dataset you like! We recommend filtering datasets by task category (e.g., text generation) to get the most relevant datasets. Still, you can use any dataset you like if you can configure the data loader to use it. (Consider, for example, [this one](https://huggingface.co/datasets/angie-chen55/javascript-github-code).)\n", - "\n", - "> **Choose a dataset from the [HuggingFace datasets library](https://huggingface.co/datasets?task_categories=task_categories:text-generation&sort=downloads).**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# Unlike _some_ code-generator models on the market, we'll limit our training data by license :)\n", - "dataset = load_dataset(\n", - " \"codeparrot/github-code\",\n", - " streaming=True,\n", - " split=\"train\",\n", - " languages=[\"JavaScript\"],\n", - " licenses=[\"mit\", \"isc\", \"apache-2.0\"],\n", - ")\n", - "# Print the schema of the first example from the training set:\n", - "print({k: type(v) for k, v in next(iter(dataset)).items()})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Like training any model, we need to define a training loop and an evaluation metric.\n", - "\n", - "This is made overwhelmingly easy with the `transformers` library. Specifically, look below at all of the code you can avoid writing by using the huggingface infrastructure. (In the past, we've used PyTorch Lightning, which had a similar training-loop abstraction. 
Do you have preferences between these two libraries?)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "### Implement the code to fine-tune the model\n", - "\n", - "Here are the big pieces of what we do below:\n", - "\n", - "* **Create a `TrainingArguments` object.** This serializable object (i.e., you can save it to memory or disk) makes it easy to train a model reproducibly with the same hyperparameters (this certainly beats having a bunch of global variables in your notebook!).\n", - "* **Encode the dataset.** This is effectively just passing everything through the tokenizer, with a padding step that fills the end of each sequence with the padding token.\n", - "* **Define our metrics.** We use the `accuracy` metric here (look at the 3rd line in the code cell).\n", - "* **Create a data collator.** This function takes a list of examples and returns a batch of examples. The `DataCollatorForLanguageModeling` class is a convenient way to do this.\n", - "* **Create a `Trainer` object.** This class wraps the training loop and makes it easy to train a model. It's a bit like the `Trainer` class in PyTorch Lightning, but it's a bit more flexible and works with non-PyTorch models as well."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling\n", - "from evaluate import load\n", - "metric = load(\"accuracy\")\n", - "\n", - "# Trainer:\n", - "training_args = TrainingArguments(\n", - " output_dir=\"./codeparrot\",\n", - " max_steps=100,\n", - " per_device_train_batch_size=1,\n", - ")\n", - "\n", - "tokenizer.pad_token = tokenizer.eos_token\n", - "\n", - "encoded_dataset = dataset.map(\n", - " lambda x: tokenizer(x[\"code\"], truncation=True, padding=\"max_length\"),\n", - " batched=True,\n", - " remove_columns=[\"code\"],\n", - ")\n", - "\n", - "\n", - "# Metrics for loss:\n", - "def compute_metrics(eval_pred):\n", - " predictions, labels = eval_pred\n", - " predictions = np.argmax(predictions, axis=-1)\n", - " return metric.compute(predictions=predictions, references=labels)\n", - "\n", - "\n", - "# Data collator:\n", - "data_collator = DataCollatorForLanguageModeling(\n", - " tokenizer=tokenizer, mlm=False,\n", - ")\n", - "\n", - "# Trainer:\n", - "trainer = Trainer(\n", - " model=model,\n", - " args=training_args,\n", - " train_dataset=encoded_dataset,\n", - " tokenizer=tokenizer,\n", - " compute_metrics=compute_metrics,\n", - " data_collator=data_collator,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# Run the actual training:\n", - "trainer.train()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Coding Exercise 4.1: Implement the code to generate text after fine-tuning.\n", - "\n", - "To generate text, we provide input tokens to the model, let it generate the next token and append it into the input tokens. Now, keep repeating this process until you reach the desired output length." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Number of tokens to generate\n", - "num_tokens = 100\n", - "\n", - "# Move the model to the CPU for inference\n", - "model.to(\"cpu\")\n", - "\n", - "# Print input prompt\n", - "print(f'Input prompt: \\n{input_prompt}')\n", - "\n", - "# Encode the input prompt\n", - "# https://huggingface.co/docs/transformers/en/main_classes/tokenizer\n", - "input_tokens = ...\n", - "\n", - "# Turn off storing gradients\n", - "with torch.no_grad():\n", - " # Keep iterating until num_tokens are generated\n", - " for tkn_idx in tqdm(range(num_tokens)):\n", - " # Forward pass through the model\n", - " # The model expects the tensor to be of Long or Int dtype\n", - " output = ...\n", - " # Get output logits\n", - " logits = output.logits[-1, :]\n", - " # Convert into probabilities\n", - " probs = nn.functional.softmax(logits, dim=-1)\n", - " # Get the index of top token\n", - " top_token = ...\n", - " # Append the token into the input sequence\n", - " input_tokens.append(top_token)\n", - "\n", - "# Decode and print the generated text\n", - "# https://huggingface.co/docs/transformers/en/main_classes/tokenizer\n", - "decoded_text = ...\n", - "print(f'Generated text: \\n{decoded_text}')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# to_remove solution\n", - "\n", - "\n", - "# Number of tokens to generate\n", - "num_tokens = 100\n", - "\n", - "# Move the model to the CPU for inference\n", - "model.to(\"cpu\")\n", - "\n", - "# Print input prompt\n", - "print(f'Input prompt: \\n{input_prompt}')\n", - "\n", - "# Encode the input prompt\n", - "# https://huggingface.co/docs/transformers/en/main_classes/tokenizer\n", - "input_tokens = tokenizer.encode(input_prompt)\n", - "\n", - "# Turn off storing gradients\n", - "with torch.no_grad():\n", - " # Keep iterating until num_tokens are generated\n", - " for tkn_idx 
in tqdm(range(num_tokens)):\n", - " # Forward pass through the model\n", - " output = model(torch.IntTensor(input_tokens))\n", - " # Get output logits\n", - " logits = output.logits[-1, :]\n", - " # Convert into probabilities\n", - " probs = nn.functional.softmax(logits, dim=-1)\n", - " # Get the index of top token\n", - " top_token = torch.argmax(probs).item()\n", - " # Append the token into the input sequence\n", - " input_tokens.append(top_token)\n", - "\n", - "# Decode and print the generated text\n", - "# https://huggingface.co/docs/transformers/en/main_classes/tokenizer\n", - "decoded_text = tokenizer.decode(input_tokens)\n", - "print(f'Generated text: \\n{decoded_text}')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "We can also directly generate text using the generation_pipeline:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# Move the model to the CPU for inference\n", - "model.to(\"cpu\")\n", - "print(\n", - " generation_pipeline(\n", - " input_prompt, max_length=100, num_return_sequences=1, temperature=0.2\n", - " )[0][\"generated_text\"]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Of course, your results will be slightly different. Here's what I got:\n", - "\n", - "```javascript\n", - "class SimpleAdder {\n", - " constructor(a, b) {\n", - " this.a = a;\n", - " this.b = b;\n", - " }\n", - "\n", - " add(\n", - "```\n", - "\n", - "Much better! The model is no longer generating Python code, and it's not trying to jam Python-flavored syntax into other languages. It's still imperfect, but it's much better than before! (And, of course, remember that this is just a small model, and we didn't train it for very long. 
You can either try training it for longer or using a larger model to get better results.)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_FineTune_the_model_Exercise\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "### Think 4.3! Accuracy metric observations\n", - "\n", - "Why might *accuracy* be a bad metric for this task?\n", - "\n", - "**Hint:** What does it mean to be \"accurate\" in this task?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# to_remove explanation\n", - "\n", - "\"\"\"\n", - "Accuracy might be a bad metric for code generation because it only measures the\n", - "exact match between the generated code and the reference code, which ignores the\n", - "fact that there can be multiple ways to implement the same functionality.\n", - "Accuracy also does not account for the logical correctness or the functional\n", - "requirements of the code.\n", - "\"\"\";" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_Accuracy_metric_observations_Discussion\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# Section 5: GPT Today and Tomorrow\n", - "\n", - "Limitation of the current models." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Video 5: Conclusion\n", - "from ipywidgets import widgets\n", - "from IPython.display import YouTubeVideo\n", - "from IPython.display import IFrame\n", - "from IPython.display import display\n", - "\n", - "\n", - "class PlayVideo(IFrame):\n", - " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", - " self.id = id\n", - " if source == 'Bilibili':\n", - " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", - " elif source == 'Osf':\n", - " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", - " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", - "\n", - "\n", - "def display_videos(video_ids, W=400, H=300, fs=1):\n", - " tab_contents = []\n", - " for i, video_id in enumerate(video_ids):\n", - " out = widgets.Output()\n", - " with out:\n", - " if video_ids[i][0] == 'Youtube':\n", - " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", - " height=H, fs=fs, rel=0)\n", - " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", - " else:\n", - " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", - " height=H, fs=fs, autoplay=False)\n", - " if video_ids[i][0] == 'Bilibili':\n", - " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", - " elif video_ids[i][0] == 'Osf':\n", - " print(f'Video available at https://osf.io/{video.id}')\n", - " display(video)\n", - " tab_contents.append(out)\n", - " return tab_contents\n", - "\n", - "\n", - "video_ids = [('Youtube', 'n1T8X0NiFqo'), ('Bilibili', 'BV1Ha4y1w73S')]\n", - "tab_contents = display_videos(video_ids, W=854, H=480)\n", - "tabs = widgets.Tab()\n", - "tabs.children = tab_contents\n", - "for i in range(len(tab_contents)):\n", - " tabs.set_title(i, video_ids[i][0])\n", - "display(tabs)" - ] 
- }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_Conclusion_Video\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "## Play around with LLMs\n", - "\n", - "Try the following questions with [ChatGPT](https://openai.com/blog/chatgpt) (GPT3.5 without access to the web) and with GPTBing in creative mode (GPT4 with access to the web). Note that the latter requires installing Microsoft Edge.\n", - "\n", - "Pick someone you know who is likely to have a web presence but is not super famous (not Musk or Trump). Ask GPT for a two-paragraph biography. How good is it?\n", - "\n", - "Ask it something like “What is the US, UK, Germany, China, and Japan's per capita income over the past ten years? Plot the data in a single figure” (depending on when and where you run this, you will need to paste the resulting Python code into a colab notebook). Try asking it questions about the data or the definition of “per capita income” used. How good is it?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_Play_around_with_LLMs_Activity\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# Summary\n", - "\n", - "In this tutorial you have become familiar with modern natural language processing (NLP) architectures. We learned about the core concepts, functionalities, and applications of these architectures. We also gained insights into prompt engineering and learned about GPT."
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# Daily survey\n", - "\n", - "Don't forget to complete your reflections and content check in the daily survey! Please be patient after logging in as there is a small delay before you will be redirected to the survey.\n", - "\n", - "\"button" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# Bonus Section: Using Large Language Models (LLMs)\n", - "\n", - "This video tells you what large language models are being used for now and how you can use them. For instance, personalized tutoring, language practice, improving writing, exam preparation, writing help and data science." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Video 6: Using GPT\n", - "from ipywidgets import widgets\n", - "from IPython.display import YouTubeVideo\n", - "from IPython.display import IFrame\n", - "from IPython.display import display\n", - "\n", - "\n", - "class PlayVideo(IFrame):\n", - " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", - " self.id = id\n", - " if source == 'Bilibili':\n", - " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", - " elif source == 'Osf':\n", - " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", - " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", - "\n", - "\n", - "def display_videos(video_ids, W=400, H=300, fs=1):\n", - " tab_contents = []\n", - " for i, video_id in enumerate(video_ids):\n", - " out = widgets.Output()\n", - " with out:\n", - " if video_ids[i][0] == 'Youtube':\n", - " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", - " height=H, fs=fs, rel=0)\n", - " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", - " else:\n", - " 
video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", - " height=H, fs=fs, autoplay=False)\n", - " if video_ids[i][0] == 'Bilibili':\n", - " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", - " elif video_ids[i][0] == 'Osf':\n", - " print(f'Video available at https://osf.io/{video.id}')\n", - " display(video)\n", - " tab_contents.append(out)\n", - " return tab_contents\n", - "\n", - "\n", - "video_ids = [('Youtube', 'JdXfuj6RP4Y'), ('Bilibili', 'BV1eX4y1v7c8')]\n", - "tab_contents = display_videos(video_ids, W=854, H=480)\n", - "tabs = widgets.Tab()\n", - "tabs.children = tab_contents\n", - "for i in range(len(tab_contents)):\n", - " tabs.set_title(i, video_ids[i][0])\n", - "display(tabs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Submit your feedback\n", - "content_review(f\"{feedback_prefix}_What_models_Video\")" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "gpuType": "T4", - "include_colab_link": true, - "name": "W3D1_Tutorial2", - "provenance": [], - "toc_visible": true - }, - "gpuClass": "standard", - "kernel": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - }, - "toc-autonumbering": true - }, - "nbformat": 4, - "nbformat_minor": 0 -} From 732abaa5d9073d6974b7dd9f2e92656a6fdae9a7 Mon Sep 17 00:00:00 2001 From: wangshaonan Date: Mon, 8 Apr 2024 22:31:33 -0400 Subject: [PATCH 2/2] Add files via upload --- .../W3D1_Tutorial2.ipynb | 1906 +++++++++++++++++ 1 file 
changed, 1906 insertions(+) create mode 100644 tutorials/W3D1_TimeSeriesAndNaturalLanguageProcessing/W3D1_Tutorial2.ipynb diff --git a/tutorials/W3D1_TimeSeriesAndNaturalLanguageProcessing/W3D1_Tutorial2.ipynb b/tutorials/W3D1_TimeSeriesAndNaturalLanguageProcessing/W3D1_Tutorial2.ipynb new file mode 100644 index 000000000..18ac99d2a --- /dev/null +++ b/tutorials/W3D1_TimeSeriesAndNaturalLanguageProcessing/W3D1_Tutorial2.ipynb @@ -0,0 +1,1906 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "execution": {}, + "id": "view-in-github" + }, + "source": [ + "\"Open   \"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "# Tutorial 2: Natural Language Processing and LLMs\n", + "\n", + "**Week 3, Day 1: Time Series and Natural Language Processing**\n", + "\n", + "**By Neuromatch Academy**\n", + "\n", + "__Content creators:__ Lyle Ungar, Jordan Matelsky, Konrad Kording, Shaonan Wang, Alish Dipani\n", + "\n", + "__Content reviewers:__ Shaonan Wang, Weizhe Yuan, Dalia Nasr, Stephen Kiilu, Alish Dipani, Dora Zhiyu Yang, Adrita Das\n", + "\n", + "__Content editors:__ Konrad Kording, Shaonan Wang\n", + "\n", + "__Production editors:__ Konrad Kording, Spiros Chavlis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# Tutorial Objectives\n", + "\n", + "This tutorial provides a comprehensive overview of modern natural language processing (NLP). It introduces two influential NLP architectures, BERT and GPT, along with a detailed exploration of the underlying NLP pipeline. Participants will learn about the core concepts, functionalities, and applications of these architectures, as well as gain insights into prompt engineering and the current and future developments of GPT." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Tutorial slides\n", + "from IPython.display import IFrame\n", + "link_id = \"spuj8\"\n", + "print(f\"If you want to download the slides: https://osf.io/download/{link_id}/\")\n", + "IFrame(src=f\"https://mfr.ca-1.osf.io/render?url=https://osf.io/{link_id}/?direct%26mode=render%26action=download%26mode=render\", width=854, height=480)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Install dependencies\n", + "# @markdown **WARNING**: There may be *errors* and/or *warnings* reported during the installation. However, they are to be ignored.\n", + "!pip3 install gensim==4.3.1 --quiet\n", + "!pip3 install pytorch_lightning --quiet\n", + "!pip3 install typing_extensions --quiet\n", + "!pip install accelerate --quiet\n", + "!pip3 install datasets --quiet\n", + "!pip3 install transformers==4.28.0 --quiet\n", + "!pip3 install evaluate --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Install and import feedback gadget\n", + "\n", + "!pip3 install vibecheck datatops --quiet\n", + "\n", + "from vibecheck import DatatopsContentReviewContainer\n", + "def content_review(notebook_section: str):\n", + " return DatatopsContentReviewContainer(\n", + " \"\", # No text prompt\n", + " notebook_section,\n", + " {\n", + " \"url\": \"https://pmyvdlilci.execute-api.us-east-1.amazonaws.com/klab\",\n", + " \"name\": \"neuromatch_dl\",\n", + " \"user_key\": \"f379rz8y\",\n", + " },\n", + " ).render()\n", + "\n", + "\n", + "feedback_prefix = \"W3D1_T2\"" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "both", + "execution": {} + }, + "outputs": [], + "source": [ + "# Imports\n", + "import random\n", + "import numpy as np\n", + "from typing import Iterable, List\n", + "from tqdm.notebook import tqdm\n", + "from typing import Dict\n", + "import pytorch_lightning as pl\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torch.utils.data import DataLoader, Dataset\n", + "from tokenizers import Tokenizer, Regex, models, normalizers, pre_tokenizers, trainers, processors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Set random seed\n", + "\n", + "# @markdown Executing `set_seed(seed=seed)` you are setting the seed\n", + "\n", + "# for DL its critical to set the random seed so that students can have a\n", + "# baseline to compare their results to expected results.\n", + "# Read more here: https://pytorch.org/docs/stable/notes/randomness.html\n", + "\n", + "# Call `set_seed` function in the exercises to ensure reproducibility.\n", + "import random\n", + "import numpy as np\n", + "\n", + "def set_seed(seed=None):\n", + " if seed is None:\n", + " seed = np.random.choice(2 ** 32)\n", + " random.seed(seed)\n", + " np.random.seed(seed)\n", + " print(f'Random seed {seed} has been set.')\n", + "\n", + "\n", + "set_seed(seed=2023) # change 2023 with any number you like" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Set device (GPU or CPU). Execute `set_device()`\n", + "\n", + "# Inform the user if the notebook uses GPU or CPU.\n", + "\n", + "def set_device():\n", + " \"\"\"\n", + " Set the device. 
CUDA if available, CPU otherwise\n", + "\n", + " Args:\n", + " None\n", + "\n", + " Returns:\n", + " Nothing\n", + " \"\"\"\n", + " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + " if device != \"cuda\":\n", + " print(\"WARNING: For this notebook to perform best, \"\n", + " \"if possible, in the menu under `Runtime` -> \"\n", + " \"`Change runtime type.` select `GPU` \")\n", + " else:\n", + " print(\"GPU is enabled in this notebook.\")\n", + "\n", + " return device" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "DEVICE = set_device()\n", + "SEED = 2021\n", + "set_seed(seed=SEED)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "\n", + "# Section 1: NLP architectures\n", + "\n", + "From RNN/LSTM to Transformers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Video 1: Intro to NLPs and LLMs\n", + "from ipywidgets import widgets\n", + "from IPython.display import YouTubeVideo\n", + "from IPython.display import IFrame\n", + "from IPython.display import display\n", + "\n", + "\n", + "class PlayVideo(IFrame):\n", + " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", + " self.id = id\n", + " if source == 'Bilibili':\n", + " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", + " elif source == 'Osf':\n", + " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", + " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", + "\n", + "\n", + "def display_videos(video_ids, W=400, H=300, fs=1):\n", + " tab_contents = []\n", + " for i, video_id in enumerate(video_ids):\n", + " out = widgets.Output()\n", + " with out:\n", + " if video_ids[i][0] == 'Youtube':\n", + " video = 
YouTubeVideo(id=video_ids[i][1], width=W,\n", + " height=H, fs=fs, rel=0)\n", + " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", + " else:\n", + " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", + " height=H, fs=fs, autoplay=False)\n", + " if video_ids[i][0] == 'Bilibili':\n", + " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", + " elif video_ids[i][0] == 'Osf':\n", + " print(f'Video available at https://osf.io/{video.id}')\n", + " display(video)\n", + " tab_contents.append(out)\n", + " return tab_contents\n", + "\n", + "\n", + "video_ids = [('Youtube', 'PCz527-WbxY'), ('Bilibili', 'BV15V4y1a7Xu')]\n", + "tab_contents = display_videos(video_ids, W=854, H=480)\n", + "tabs = widgets.Tab()\n", + "tabs.children = tab_contents\n", + "for i in range(len(tab_contents)):\n", + " tabs.set_title(i, video_ids[i][0])\n", + "display(tabs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "A core principle of Natural Language Processing is embedding words as vectors. In the relevant vector space, words with similar meanings are close to one another.\n", + "\n", + "In classical transformer systems, a core principle is encoding and decoding. We can encode an input sequence as a vector (that implicitly codes what we just read). And we can then take this vector and decode it, e.g., as a new sentence. So a sequence-to-sequence (e.g., sentence translation) system may read a sentence (made out of words embedded in a relevant space) and encode it as an overall vector. It then takes the resulting encoding of the sentence and decodes it into a translated sentence.\n", + "\n", + "In modern transformer systems, such as GPT, all words are used parallelly. In that sense, the transformers generalize the encoding/decoding idea. Examples of this strategy include all the modern large language models (such as GPT)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Submit your feedback\n", + "content_review(f\"{feedback_prefix}_Intro_to_NLPs_and_LLMs_Video\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# Section 2: The NLP pipeline\n", + "\n", + "Tokenize, pretrain, fine-tune" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Video 2: NLP pipeline\n", + "from ipywidgets import widgets\n", + "from IPython.display import YouTubeVideo\n", + "from IPython.display import IFrame\n", + "from IPython.display import display\n", + "\n", + "\n", + "class PlayVideo(IFrame):\n", + " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", + " self.id = id\n", + " if source == 'Bilibili':\n", + " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", + " elif source == 'Osf':\n", + " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", + " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", + "\n", + "\n", + "def display_videos(video_ids, W=400, H=300, fs=1):\n", + " tab_contents = []\n", + " for i, video_id in enumerate(video_ids):\n", + " out = widgets.Output()\n", + " with out:\n", + " if video_ids[i][0] == 'Youtube':\n", + " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", + " height=H, fs=fs, rel=0)\n", + " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", + " else:\n", + " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", + " height=H, fs=fs, autoplay=False)\n", + " if video_ids[i][0] == 'Bilibili':\n", + " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", + " elif video_ids[i][0] == 'Osf':\n", + " print(f'Video 
available at https://osf.io/{video.id}')\n", + " display(video)\n", + " tab_contents.append(out)\n", + " return tab_contents\n", + "\n", + "\n", + "video_ids = [('Youtube', 'uPnTVbc4qUE'), ('Bilibili', 'BV1TM4y1E7ab')]\n", + "tab_contents = display_videos(video_ids, W=854, H=480)\n", + "tabs = widgets.Tab()\n", + "tabs.children = tab_contents\n", + "for i in range(len(tab_contents)):\n", + " tabs.set_title(i, video_ids[i][0])\n", + "display(tabs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Submit your feedback\n", + "content_review(f\"{feedback_prefix}_NLP_pipeline_Video\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "## Tokenizers\n", + "\n", + "Today we will practise embedding techniques, and continue our march toward large language models and transformers by discussing one of the critical developments of the modern NLP stack: **Tokenization.** Tokenizers convert inputs as a set of discrete tokens.\n", + "\n", + "### Learning Goals\n", + "\n", + "* Understand the concept of tokenization and why it is useful.\n", + "* Learn how to write a tokenizer from scratch, taking advantage of context.\n", + "* Get an intuition for how modern tokenizers work by playing with a few pre-trained tokenizers from industry." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "## Generating a dataset\n", + "\n", + "As we continue to move closer to \"production-grade\" NLP, we'll start to use industry standards such as the [HuggingFace](https://huggingface.co/) library. Huggingface is a large company that facilitates the exchange of aspects of modern deep learning systems.\n", + "\n", + "We'll start by generating a training dataset. 
`hf` has a convenient `datasets` module that allows us to download a variety of datasets, including the [Wikipedia text corpus](https://huggingface.co/datasets/wiki_text). We'll use this to generate a dataset of text from Wikipedia." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "dataset = load_dataset(\"wikitext\", \"wikitext-103-raw-v1\", split=\"train\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "print(dataset[41492])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "def generate_n_examples(dataset, n=512):\n", + " \"\"\"\n", + " Produce a generator that yields n examples at a time from the dataset.\n", + " \"\"\"\n", + " for i in range(0, len(dataset), n):\n", + " yield dataset[i:i + n]['text']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Now we will create the actual `Tokenizer`, adhering to the [`hf.Tokenizer` protocol](https://huggingface.co/docs/transformers/main_classes/tokenizer). (Adhering to a standard protocol enables us to swap in our tokenizer for any tokenizer in the huggingface ecosystem or to apply our own tokenizer to any model in the huggingface ecosystem.)\n", + "\n", + "Let's sketch out the steps of writing a Tokenizer. We need to solve two problems:\n", + "\n", + "* Given a string, split it into a list of tokens.\n", + "* If you don't recognize a word, still figure out a way to tokenize it!\n", + "\n", + "This may feel like we're reinventing our one-hot encoder with a richer vocabulary. 
Why is it that the One-Hot-Encoder, which outputs a vector of length $|V|$, where $|V|$ is the size of our vocabulary, is not sufficient, but a tokenizer that outputs a list of indices into a vocabulary of size $|V|$ is sufficient? The answer is that while our encoder was responsible for embedding words into a high-dimensional space, our tokenizer is NOT; the \"win\" of a tokenizer is that it breaks up a string into in-vocab elements. For certain workflows, the very next step might be adding an embedder onto the end of the tokenizer. (As we'll soon see, this is exactly the strategy employed by modern Transformer models.)\n", + "\n", + "Tokens will almost always be different from words; for example, we might want to split \"don't\" into \"do\" and \"n't\", or we might want to split \"don't\" into \"do\" and \"not\". Or we might even want to split \"don't\" into \"d\", \"o\", \"n\", and \"t\". We can choose any strategy we want here; **unlike Word2Vec, our tokenizer will NOT be limited to outputting one vector per English word.** Here, we'll use an off-the-shelf subword splitter, which we discuss below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "VOCAB_SIZE = 12_000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# Create a tokenizer object that uses the \"WordPiece\" model. The WordPiece model\n", + "# is a subword tokenizer that uses a vocabulary of common words and word pieces\n", + "# to tokenize text. The \"unk_token\" parameter specifies the token to use for\n", + "# unknown tokens, i.e. tokens that are not in the vocabulary. 
(Remember that the\n", + "# vocabulary will be built from our dataset, so it will include subchunks of\n", + "# English words.)\n", + "tokenizer = Tokenizer(models.WordPiece(unk_token=\"[UNK]\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "## Tokenizer Features\n", + "\n", + "Now let's start dressing up our tokenizer with some useful features. First, let's clean up the text. This process is formally called \"normalization\" and is a critical step in any NLP pipeline. We'll replace every whitespace character with a plain space and then convert all the text to lowercase. We'll also remove diacritics (accents) from the text." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# Think of a Normalizer Sequence the same way you would think of a PyTorch\n", + "# Sequential model. It is a sequence of normalizers that are applied to the\n", + "# text before tokenization, in the order that they are added to the sequence.\n", + "\n", + "tokenizer.normalizer = normalizers.Sequence([\n", + " normalizers.Replace(Regex(r\"[\\s]\"), \" \"), # Replace each whitespace character with a plain space\n", + " normalizers.Lowercase(), # Convert all text to lowercase\n", + " normalizers.NFD(), # Decompose all characters into their base characters\n", + " normalizers.StripAccents(), # Remove all accents\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Next, we'll add a pre-tokenizer. The pre-tokenizer is applied to the text after normalizing it but before it's tokenized. The pre-tokenizer is useful for splitting text into chunks, which are easier to tokenize. For example, we can split text into chunks separated by punctuation or whitespace." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "tokenizer.pre_tokenizer = pre_tokenizers.Sequence([\n", + " pre_tokenizers.WhitespaceSplit(), # Split on whitespace\n", + " pre_tokenizers.Digits(individual_digits=True), # Split digits into individual tokens\n", + " pre_tokenizers.Punctuation(), # Split punctuation into individual tokens\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note:** In practice, it is not necessary to use pre-tokenizers, but we use them here for demonstration purposes. For instance, \"2-3\" is not the same as \"23\", so removing punctuation or splitting up digits or punctuation is a bad idea! Moreover, the current tokenizer is powerful enough to deal with punctuation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Finally, we'll train the tokenizer with our dataset. After all, we want a tokenizer that works well on this dataset. There are a few different algorithms for training tokenizers. Here are two common ones:\n", + "\n", + "* BPE Algorithm: Start with a vocabulary of each character in the dataset. Examine all adjacent pairs of vocabulary items and merge the pair with the highest frequency in the dataset. Repeat until the vocabulary size is reached (so \"ee\" is more likely to get merged than \"zf\" in an English corpus).\n", + "* Top-Down WordPiece Algorithm: Generate all substrings of each word from the dataset and count occurrences in the training data. Keep any string that occurs more than a threshold number of times. Repeat this process until the vocabulary size is reached. (For a more thorough explanation of this process, see [the TensorFlow Guide](https://www.tensorflow.org/text/guide/subwords_tokenizer#optional_the_algorithm).)\n", + "\n", + "We'll use WordPiece in the next cell." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "tokenizer_trainer = trainers.WordPieceTrainer(\n", + " vocab_size=VOCAB_SIZE,\n", + " # We have to specify the special tokens that we want to use. These will be\n", + " # added to the vocabulary no matter what the vocab-building algorithm does.\n", + " special_tokens=[\"[PAD]\", \"[UNK]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"],\n", + " show_progress=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Special Tokens\n", + "\n", + "Tokenizers often have special tokens representing certain concepts such as:\n", + "* [PAD]: Added to the end of shorter input sequences to ensure equal input length for the whole batch\n", + "* [START]: Start of the sequence\n", + "* [END]: End of the sequence\n", + "* [UNK]: Unknown characters not present in the vocabulary\n", + "* [BOS]: Beginning of sentence\n", + "* [EOS]: End of sentence\n", + "* [SEP]: Separation between two sentences in a sequence\n", + "* [CLS]: Token used for classification tasks to represent the whole sequence\n", + "* [MASK]: Used in the pre-training phase for masked language modeling tasks in models like BERT" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "These special tokens are important because they tell the WordPiece training process how to treat phrases, masks, and unknown tokens.\n", + "\n", + "**Note:** We can also add our own special tokens, such as `[CITE]`, to indicate when a citation is about to be used if we want to train a model to predict the presence of citations in a text. Training the tokenizer in the following cells will take a bit of time." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "sample_ratio = 0.2\n", + "keep = int(len(dataset)*sample_ratio)\n", + "dataset_small = load_dataset(\"wikitext\", \"wikitext-103-raw-v1\", split=f\"train[:{keep}]\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "tokenizer.train_from_iterator(generate_n_examples(dataset_small), trainer=tokenizer_trainer, length=len(dataset_small))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# In \"real life\", we'd probably want to save the tokenizer to disk so that we\n", + "# can use it later. We can do this with the \"save\" method:\n", + "# tokenizer.save(\"tokenizer.json\")\n", + "\n", + "# Let's try it out!\n", + "print(\"Hello, world!\")\n", + "print(\n", + " *zip(\n", + " tokenizer.encode(\"Hello, world!\").tokens,\n", + " tokenizer.encode(\"Hello, world!\").ids,\n", + " )\n", + ")\n", + "\n", + "\n", + "# Can we also tokenize made-up words?\n", + "print(tokenizer.encode(\"These toastersocks are so groommpy!\").tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "(The `##` means that the token is a continuation of the previous chunk.)\n", + "\n", + "Try playing around with the hyperparameters and the tokenizing algorithms to see how they affect the tokenizer's output. There can be some very major differences!\n", + "\n", + "In summary, we created a tokenizer pipeline that:\n", + "\n", + "* Normalizes the text (cleans up punctuation and diacritics)\n", + "* Splits the text into chunks (using whitespace and punctuation)\n", + "* Trains the tokenizer on the dataset (using the WordPiece algorithm)\n", + "\n", + "In common use, this would be the first step of any modern NLP pipeline. 
The next step would be to add an embedder to the end of the tokenizer, so that we can feed high-dimensional vector representations of the tokens into our model. But unlike Word2Vec, we can now separate the tokenization step from the embedding step, which means our encoding/embedding process can be task-specific, custom to our downstream neural net architecture, instead of general-purpose." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "### Think 2.1! Tokenizer good practices\n", + "\n", + "We established that the tokenizer is a better move than the One-Hot-Encoder because it can handle out-of-vocabulary words. But what if we just made a one-hot encoding where the vocabulary is all possible two-character combinations? Would there still be an advantage to the tokenizer?\n", + "\n", + "**Hint:** Re-read the section on the BPE and WordPiece algorithms, and how the tokens are selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# to_remove explanation\n", + "\n", + "\"\"\"\n", + "If we used a one-hot encoding where the vocabulary is all possible two-character\n", + "combinations, we would still face some problems that the tokenizer can solve.\n", + "Here are some of them:\n", + "\n", + "* The vocabulary size would be very large, since there are 26^2 = 676 possible\n", + "two-character combinations in English. This would make the one-hot vectors\n", + "very sparse and high-dimensional, which can affect the efficiency and\n", + "performance of the model.\n", + "* The one-hot encoding would not capture any semantic or syntactic information\n", + "about the words, since each two-character combination would be treated as an\n", + "independent unit. 
This would make it harder for the model to learn meaningful\n", + "representations of the words and their contexts.\n", + "* The one-hot encoding would not handle rare or unseen words well, since\n", + "it would either ignore them or assign them to a generic unknown token.\n", + "This would limit the generalization ability of the model and reduce its\n", + "accuracy on new data.\n", + "\n", + "\n", + "The tokenizer, on the other hand, can overcome these problems by using subword\n", + "units that are based on the frequency and co-occurrence of characters\n", + "in the corpus. The tokenizer can:\n", + "\n", + "* Reduce the vocabulary size by merging frequent and meaningful subword units\n", + "into larger tokens. For example, instead of having separate tokens\n", + "for “in”, “ing”, “tion”, etc., the tokenizer can merge them into a single token\n", + "that represents a common suffix.\n", + "* Capture some semantic and syntactic information about the words, since the\n", + "subword units are derived from the data and reflect how words are composed and\n", + "used. For example, the tokenizer can split a word like “unhappy” into “un” and\n", + "“happy”, which preserves some information about its meaning and structure.\n", + "* Handle rare or unseen words better, since it can split them into smaller\n", + "subword units that are likely to be in the vocabulary. For example, if the word\n", + "“neural” is not in the vocabulary, the tokenizer can split it into “neu” and\n", + "“ral”, which are more likely to be seen in other words.\n", + "\n", + "Therefore, there is still an advantage to using the tokenizer over the\n", + "one-hot encoding, even if we use all possible two-character combinations\n", + "as the vocabulary. 
The tokenizer can create more compact, informative, and\n", + "flexible representations of words that can improve the performance of the model.\n", + "\"\"\";" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Submit your feedback\n", + "content_review(f\"{feedback_prefix}_Tokenizer_good_practices_Discussion\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "### Think 2.2: Chinese and English tokenizer\n", + "\n", + "Let's think about a language like Chinese, where each word is composed of relatively few characters compared to English (`hungry` is six Unicode characters, but `饿` is one Unicode character), but there are many more unique Chinese characters than there are letters in the English alphabet.\n", + "\n", + "In a one- or two-sentence high-level sketch, what properties would be desirable for a Chinese tokenizer to have?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# to_remove explanation\n", + "\n", + "\"\"\"\n", + "For instance, it should be able to segment words based on the meaning and usage\n", + "of the characters, rather than relying on spaces or punctuation.\n", + "For example, it should recognize that “北京” is a single word meaning “Beijing”,\n", + "rather than two separate characters.\n", + "\"\"\";" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Submit your feedback\n", + "content_review(f\"{feedback_prefix}_Chinese_and_English_tokenizer_Discussion\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# Section 3: Using BERT\n", + "\n", + "In this section, we will learn about using the BERT model from huggingface.\n", + "\n", + "## Learning Goals\n", + "* Understand the idea behind BERT\n", + "* Understand the idea of pre-training and fine-tuning\n", + "* Understand how freezing parts of the network is useful" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Video 3: BERT\n", + "from ipywidgets import widgets\n", + "from IPython.display import YouTubeVideo\n", + "from IPython.display import IFrame\n", + "from IPython.display import display\n", + "\n", + "\n", + "class PlayVideo(IFrame):\n", + " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", + " self.id = id\n", + " if source == 'Bilibili':\n", + " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", + " elif source == 'Osf':\n", + " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", + " super(PlayVideo, self).__init__(src, width, height, 
**kwargs)\n", + "\n", + "\n", + "def display_videos(video_ids, W=400, H=300, fs=1):\n", + " tab_contents = []\n", + " for i, video_id in enumerate(video_ids):\n", + " out = widgets.Output()\n", + " with out:\n", + " if video_ids[i][0] == 'Youtube':\n", + " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", + " height=H, fs=fs, rel=0)\n", + " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", + " else:\n", + " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", + " height=H, fs=fs, autoplay=False)\n", + " if video_ids[i][0] == 'Bilibili':\n", + " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", + " elif video_ids[i][0] == 'Osf':\n", + " print(f'Video available at https://osf.io/{video.id}')\n", + " display(video)\n", + " tab_contents.append(out)\n", + " return tab_contents\n", + "\n", + "\n", + "video_ids = [('Youtube', 'u4D-84Z1Fxs'), ('Bilibili', 'BV17u411b7gJ')]\n", + "tab_contents = display_videos(video_ids, W=854, H=480)\n", + "tabs = widgets.Tab()\n", + "tabs.children = tab_contents\n", + "for i in range(len(tab_contents)):\n", + " tabs.set_title(i, video_ids[i][0])\n", + "display(tabs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Submit your feedback\n", + "content_review(f\"{feedback_prefix}_BERT_Video\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "# Section 4: NLG with GPT\n", + "\n", + "In this section we will learn about Natural Language Generation with Generative Pretrained Transformers.\n", + "\n", + "## Learning goals\n", + "* How to produce language with GPTs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Video 4: NLG\n", + "from ipywidgets import widgets\n", + "from IPython.display 
import YouTubeVideo\n", + "from IPython.display import IFrame\n", + "from IPython.display import display\n", + "\n", + "\n", + "class PlayVideo(IFrame):\n", + " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", + " self.id = id\n", + " if source == 'Bilibili':\n", + " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", + " elif source == 'Osf':\n", + " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", + " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", + "\n", + "\n", + "def display_videos(video_ids, W=400, H=300, fs=1):\n", + " tab_contents = []\n", + " for i, video_id in enumerate(video_ids):\n", + " out = widgets.Output()\n", + " with out:\n", + " if video_ids[i][0] == 'Youtube':\n", + " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", + " height=H, fs=fs, rel=0)\n", + " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", + " else:\n", + " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", + " height=H, fs=fs, autoplay=False)\n", + " if video_ids[i][0] == 'Bilibili':\n", + " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", + " elif video_ids[i][0] == 'Osf':\n", + " print(f'Video available at https://osf.io/{video.id}')\n", + " display(video)\n", + " tab_contents.append(out)\n", + " return tab_contents\n", + "\n", + "\n", + "video_ids = [('Youtube', 'vwFMHitq-FY'), ('Bilibili', 'BV1Hu411b7dx')]\n", + "tab_contents = display_videos(video_ids, W=854, H=480)\n", + "tabs = widgets.Tab()\n", + "tabs.children = tab_contents\n", + "for i in range(len(tab_contents)):\n", + " tabs.set_title(i, video_ids[i][0])\n", + "display(tabs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Submit your feedback\n", + "content_review(f\"{feedback_prefix}_NLG_Video\")" + ] + }, + 
{ + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "## Using state-of-the-art (SOTA) Models\n", + "\n", + "Unless you are writing your own experimental DL research (and sometimes even then!) it is _far_ more common these days to use the HuggingFace model library to import and start working with state-of-the-art models quickly. In this section, we will show you how to do that.\n", + "\n", + "We will download a pretrained model from the hf `transformers` library that is used to generate text. We will then fine-tune it on a different dataset, using the `hf.datasets` library and the HuggingFace Trainer classes to make the process as easy as possible, and we'll see that we can accomplish all of this in just a few lines of easily maintained code.\n", + "\n", + "Ultimately, we will have a _working_ generator... for code!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "We're first going to pick a tokenizer. You can see some of the options [here](https://huggingface.co/transformers/pretrained_models.html). We'll use CodeParrot tokenizer, which is a BPE tokenizer. But you can choose (or build!) another if you'd like to try offroading!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer\n", + "from datasets import load_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained(\"codeparrot/codeparrot-small\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "### Think 4.1! Tokenizers\n", + "\n", + "Why can you use a different tokenizer than the one that was originally used? What requirements must another tokenizer for this task have?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# to_remove explanation\n", + "\n", + "\"\"\"\n", + "You couldn't, for example, use the very popular `bert-base-uncased` tokenizer,\n", + "even though it's a popular choice for text generation tasks that were trained\n", + "on the English Wikipedia and the BookCorpus datasets (which are both available\n", + "in the `hf.datasets` library).\n", + "\"\"\";" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Submit your feedback\n", + "content_review(f\"{feedback_prefix}_Tokenizers_Discussion\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Next, we'll download a pre-built model architecture. CodeParrot (the model) is a GPT-2 model, which is a transformer-based language model. You can see some of the options [here](https://huggingface.co/transformers/pretrained_models.html). But you can choose (or build!) another!\n", + "\n", + "Note that `codeparrot/codeparrot` (https://huggingface.co/codeparrot/codeparrot) is about 7GB to download (so it may take a while, or it may be too large for your runtime if you're on a free Colab). Instead, we will use a smaller model, `codeparrot/codeparrot-small` (https://huggingface.co/codeparrot/codeparrot-small), which is only ~500MB.\n", + "\n", + "To run everything together — tokenization, model, and de-tokenization, we can use the `pipeline` function from `transformers`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "from transformers import AutoModelWithLMHead\n", + "from transformers import pipeline\n", + "\n", + "model = AutoModelWithLMHead.from_pretrained(\"codeparrot/codeparrot-small\")\n", + "generation_pipeline = pipeline(\n", + " \"text-generation\", # The task to run. This tells hf what the pipeline steps are\n", + " model=model, # The model to use; can also pass the string here;\n", + " tokenizer=tokenizer, # The tokenizer to use; can also pass the string name here.\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "input_prompt = '''\\\n", + "def simple_add(a: int, b: int) -> int:\n", + " \"\"\"\n", + " Adds two numbers together and returns the result.\n", + " \"\"\"'''\n", + "\n", + "# Return tensors for PyTorch:\n", + "inputs = tokenizer(input_prompt, return_tensors=\"pt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Recall that these tokens are integer indices in the vocabulary of the tokenizer. We can use the tokenizer to decode these tokens into a string, which we can print out to see what the model generates." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "input_token_ids = inputs[\"input_ids\"]\n", + "input_strs = tokenizer.convert_ids_to_tokens(*input_token_ids.tolist())\n", + "\n", + "print(*zip(input_strs, input_token_ids[0]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "**(Quick knowledge-check: what are the weirdly-rendering characters representing?)**\n", + "\n", + "This model is already ready to use! Let's give it a try. 
(Note that we don't use `inputs` — we just generated that to show the initial tokenization steps.)\n", + "\n", + "Here, we use the `pipeline` we created earlier to combine all our components. If you were writing a Copilot-style code-completer, you could get away with wrapping this single line in a nice API and calling it a day!\n", + "\n", + "Play with the hyperparameters and see what kinds of outputs you can get. Temperature measures how much randomness is added to the model's predictions. Higher temperature means more randomness and lower temperature means less randomness. More randomness in the latent space will lead to wilder predictions and potentially more creative answers. A good place to start is `0.2`. You can also try changing the `max_length` parameter, which controls how long the generated code can be (though the model can opt to put a \"stop\" token in the middle of the sequence, so it may not always generate exactly this many tokens)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "outputs = generation_pipeline(input_prompt, max_length=100, num_return_sequences=1, temperature=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "print(outputs[0][\"generated_text\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Let's see if we can fool our model now! The huggingface documentation tells us that the codeparrot model was trained to generate Python code ([docs](https://huggingface.co/codeparrot/codeparrot-small)). Let's see if we can get it to generate some JavaScript." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "input_prompt = \"class SimpleAdder {\"\n", + "\n", + "print(generation_pipeline(input_prompt, max_length=100, num_return_sequences=1, temperature=0.2)[0][\"generated_text\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Yikes! I don't know what it generated for you, but what it made for me was:\n", + "\n", + "```python\n", + "class SimpleAdder {\n", + " public:\n", + " class SimpleAdder(object):\n", + " def __init__(self, a, b):\n", + " self.a = a\n", + " self.b = b\n", + "\n", + " def __call__(self, x):\n", + " return self.a + x\n", + "```\n", + "\n", + "**Ew!** That's wrong in a _lot_ of ways. But it's understandable: Our model can't really generalize outside of the domain in which it was trained. And so probably there were a few Python files that included syntax of other languages (perhaps generators for other code?). So the model knows that there's some mysterious syntax that uses curly brackets... But it's not sure about anything else. (For the programming-language hobbyists among you: The `public` notation looks to me a lot like the model is trying to do something C-flavored and perhaps something Java-flavored; I like it! But it's definitely not JavaScript.)\n", + "\n", + "What are the major observations?\n", + "\n", + "* The syntax it's generating rapidly devolves into Python; it can predict only a few characters of non-Python before falling back into its familiar training territory.\n", + "* The part of the code that follows Python syntax is valid and resembles a useful class definition (although if you look closely, it doesn't seem to do anything useful with the `b` attribute...). This tells us that the model \"understands\" its problem domain but hasn't been trained on the correct data to solve our new problem."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "### Think 4.2! Using SOTA models\n", + "\n", + "What are your other observations about the code it generated for you? You're now aware of how Transformers work.\n", + "\n", + "1. Think specifically and remark about the observations a machine learning practitioner would make here if your role were to diagnose the error in a production system.\n", + "2. Now, how would a nonexpert user interpret the issues?\n", + "3. Do you think the model-reported confidence for this output would be high, low, or in between...?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# to_remove explanation\n", + "\n", + "\"\"\"\n", + "Here is one possible answer.\n", + "1. The model is not well-trained or fine-tuned on the task of generating Python\n", + "code from natural language instructions. It may have insufficient data,\n", + "low quality data, or inappropriate hyperparameters.\n", + "2. The model is not smart or reliable enough to write code for them.\n", + "It may have bugs, glitches, or limitations that prevent it from working properly.\n", + "3. I think the model-reported confidence for this output would be low, since\n", + "the output has many errors and deviations from the instructions. However, the\n", + "confidence may also depend on how the model is trained and calibrated, and how\n", + "it estimates its own uncertainty and quality.\n", + "\"\"\";" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Submit your feedback\n", + "content_review(f\"{feedback_prefix}_Using_SOTA_models_Discussion\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "## Fine-Tuning\n", + "\n", + "Alright, so we have a model that can generate code. 
But now, we want to fine-tune it to generate JavaScript.\n", + "\n", + "Assuming the data will be too large to fit on disk on Colab, we'll use the `load_dataset` function to download only part of the dataset. There's a JavaScript subset to the codeparrot dataset, which we'll use as an example… But you can use any dataset you like! We recommend filtering datasets by task category (e.g., text generation) to get the most relevant datasets. Still, you can use any dataset you like if you can configure the data loader to use it. (Consider, for example, [this one](https://huggingface.co/datasets/angie-chen55/javascript-github-code).)\n", + "\n", + "> **Choose a dataset from the [HuggingFace datasets library](https://huggingface.co/datasets?task_categories=task_categories:text-generation&sort=downloads).**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# Unlike _some_ code-generator models on the market, we'll limit our training data by license :)\n", + "dataset = load_dataset(\n", + " \"codeparrot/github-code\",\n", + " streaming=True,\n", + " split=\"train\",\n", + " languages=[\"JavaScript\"],\n", + " licenses=[\"mit\", \"isc\", \"apache-2.0\"],\n", + ")\n", + "# Print the schema of the first example from the training set:\n", + "print({k: type(v) for k, v in next(iter(dataset)).items()})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Like training any model, we need to define a training loop and an evaluation metric.\n", + "\n", + "This is made overwhelmingly easy with the `transformers` library. Specifically, look below at all of the code you can avoid using the huggingface infrastructure. (In the past, we've used PyTorch Lightning, which had a similar training-loop abstraction. 
Do you have preferences between these two libraries?)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "### Implement the code to fine-tune the model\n", + "\n", + "Here are the big pieces of what we do below:\n", + "\n", + "* **Create a `TrainingArguments` object.** This serializable object (i.e., you can save it to memory or disk) makes it easy to train a model reproducibly with the same hyperparameters (this certainly beats having a bunch of global variables in your notebook!).\n", + "* **Encode the dataset.** This is effectively just passing everything through the tokenizer, with a padding step that fills the end of each sequence with the padding token.\n", + "* **Define our metrics.** We use the `accuracy` metric here (look at the 3rd line in the code cell).\n", + "* **Create a data collator.** This function takes a list of examples and returns a batch of examples. The `DataCollatorForLanguageModeling` class is a convenient way to do this.\n", + "* **Create a `Trainer` object.** This class wraps the training loop and makes it easy to train a model. It's a bit like the `Trainer` class in PyTorch Lightning, but it's a bit more flexible and works with non-PyTorch models as well."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling\n", + "from evaluate import load\n", + "metric = load(\"accuracy\")\n", + "\n", + "# Trainer:\n", + "training_args = TrainingArguments(\n", + " output_dir=\"./codeparrot\",\n", + " max_steps=100,\n", + " per_device_train_batch_size=1,\n", + ")\n", + "\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "\n", + "encoded_dataset = dataset.map(\n", + " lambda x: tokenizer(x[\"code\"], truncation=True, padding=\"max_length\"),\n", + " batched=True,\n", + " remove_columns=[\"code\"],\n", + ")\n", + "\n", + "\n", + "# Metrics for loss:\n", + "def compute_metrics(eval_pred):\n", + " predictions, labels = eval_pred\n", + " predictions = np.argmax(predictions, axis=-1)\n", + " return metric.compute(predictions=predictions, references=labels)\n", + "\n", + "\n", + "# Data collator:\n", + "data_collator = DataCollatorForLanguageModeling(\n", + " tokenizer=tokenizer, mlm=False,\n", + ")\n", + "\n", + "# Trainer:\n", + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=encoded_dataset,\n", + " tokenizer=tokenizer,\n", + " compute_metrics=compute_metrics,\n", + " data_collator=data_collator,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# Run the actual training:\n", + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Coding Exercise 4.1: Implement the code to generate text after fine-tuning.\n", + "\n", + "To generate text, we provide input tokens to the model, let it generate the next token and append it into the input tokens. Now, keep repeating this process until you reach the desired output length." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Number of tokens to generate\n", + "num_tokens = 100\n", + "\n", + "# Move the model to the CPU for inference\n", + "model.to(\"cpu\")\n", + "\n", + "# Print input prompt\n", + "print(f'Input prompt: \\n{input_prompt}')\n", + "\n", + "# Encode the input prompt\n", + "# https://huggingface.co/docs/transformers/en/main_classes/tokenizer\n", + "input_tokens = ...\n", + "\n", + "# Turn off storing gradients\n", + "with torch.no_grad():\n", + " # Keep iterating until num_tokens are generated\n", + " for tkn_idx in tqdm(range(num_tokens)):\n", + " # Forward pass through the model\n", + " # The model expects the tensor to be of Long or Int dtype\n", + " output = ...\n", + " # Get output logits\n", + " logits = output.logits[-1, :]\n", + " # Convert into probabilities\n", + " probs = nn.functional.softmax(logits, dim=-1)\n", + " # Get the index of top token\n", + " top_token = ...\n", + " # Append the token into the input sequence\n", + " input_tokens.append(top_token)\n", + "\n", + "# Decode and print the generated text\n", + "# https://huggingface.co/docs/transformers/en/main_classes/tokenizer\n", + "decoded_text = ...\n", + "print(f'Generated text: \\n{decoded_text}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# to_remove solution\n", + "\n", + "\n", + "# Number of tokens to generate\n", + "num_tokens = 100\n", + "\n", + "# Move the model to the CPU for inference\n", + "model.to(\"cpu\")\n", + "\n", + "# Print input prompt\n", + "print(f'Input prompt: \\n{input_prompt}')\n", + "\n", + "# Encode the input prompt\n", + "# https://huggingface.co/docs/transformers/en/main_classes/tokenizer\n", + "input_tokens = tokenizer.encode(input_prompt)\n", + "\n", + "# Turn off storing gradients\n", + "with torch.no_grad():\n", + " # Keep iterating until num_tokens are generated\n", + " for tkn_idx 
in tqdm(range(num_tokens)):\n", + " # Forward pass through the model\n", + " output = model(torch.IntTensor(input_tokens))\n", + " # Get output logits\n", + " logits = output.logits[-1, :]\n", + " # Convert into probabilities\n", + " probs = nn.functional.softmax(logits, dim=-1)\n", + " # Get the index of top token\n", + " top_token = torch.argmax(probs).item()\n", + " # Append the token into the input sequence\n", + " input_tokens.append(top_token)\n", + "\n", + "# Decode and print the generated text\n", + "# https://huggingface.co/docs/transformers/en/main_classes/tokenizer\n", + "decoded_text = tokenizer.decode(input_tokens)\n", + "print(f'Generated text: \\n{decoded_text}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "We can also directly generate text using the generation_pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# Move the model to the CPU for inference\n", + "model.to(\"cpu\")\n", + "print(\n", + " generation_pipeline(\n", + " input_prompt, max_length=100, num_return_sequences=1, temperature=0.2\n", + " )[0][\"generated_text\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Of course, your results will be slightly different. Here's what I got:\n", + "\n", + "```javascript\n", + "class SimpleAdder {\n", + " constructor(a, b) {\n", + " this.a = a;\n", + " this.b = b;\n", + " }\n", + "\n", + " add(\n", + "```\n", + "\n", + "Much better! The model is no longer generating Python code, and it's not trying to jam Python-flavored syntax into other languages. It's still imperfect, but it's much better than before! (And, of course, remember that this is just a small model, and we didn't train it for very long. 
You can either try training it for longer or using a larger model to get better results.)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Submit your feedback\n", + "content_review(f\"{feedback_prefix}_FineTune_the_model_Exercise\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "### Think 4.3! Accuracy metric observations\n", + "\n", + "Why might *accuracy* be a bad metric for this task?\n", + "\n", + "**Hint:** What does it mean to be \"accurate\" in this task?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# to_remove explanation\n", + "\n", + "\"\"\"\n", + "Accuracy might be a bad metric for code generation because it only measures the\n", + "exact match between the generated code and the reference code, which ignores the\n", + "fact that there can be multiple ways to implement the same functionality.\n", + "Accuracy also does not account for the logical correctness or the functional\n", + "requirements of the code.\n", + "\"\"\";" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Submit your feedback\n", + "content_review(f\"{feedback_prefix}_Accuracy_metric_observations_Discussion\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# Section 5: GPT Today and Tomorrow\n", + "\n", + "Limitations of the current models."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Video 5: Conclusion\n", + "from ipywidgets import widgets\n", + "from IPython.display import YouTubeVideo\n", + "from IPython.display import IFrame\n", + "from IPython.display import display\n", + "\n", + "\n", + "class PlayVideo(IFrame):\n", + " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", + " self.id = id\n", + " if source == 'Bilibili':\n", + " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", + " elif source == 'Osf':\n", + " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", + " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", + "\n", + "\n", + "def display_videos(video_ids, W=400, H=300, fs=1):\n", + " tab_contents = []\n", + " for i, video_id in enumerate(video_ids):\n", + " out = widgets.Output()\n", + " with out:\n", + " if video_ids[i][0] == 'Youtube':\n", + " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", + " height=H, fs=fs, rel=0)\n", + " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", + " else:\n", + " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", + " height=H, fs=fs, autoplay=False)\n", + " if video_ids[i][0] == 'Bilibili':\n", + " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", + " elif video_ids[i][0] == 'Osf':\n", + " print(f'Video available at https://osf.io/{video.id}')\n", + " display(video)\n", + " tab_contents.append(out)\n", + " return tab_contents\n", + "\n", + "\n", + "video_ids = [('Youtube', 'n1T8X0NiFqo'), ('Bilibili', 'BV1Ha4y1w73S')]\n", + "tab_contents = display_videos(video_ids, W=854, H=480)\n", + "tabs = widgets.Tab()\n", + "tabs.children = tab_contents\n", + "for i in range(len(tab_contents)):\n", + " tabs.set_title(i, video_ids[i][0])\n", + "display(tabs)" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Submit your feedback\n", + "content_review(f\"{feedback_prefix}_Conclusion_Video\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "## Play around with LLMs\n", + "\n", + "1. Try using LLMs' API to do tasks, such as utilizing the GPT-2 API to extend text from a provided context. To achieve this, ensure you have a HuggingFace account and secure an API token." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + " def query(payload, model_id, api_token):\n", + " headers = {\"Authorization\": f\"Bearer {api_token}\"}\n", + " API_URL = f\"https://api-inference.huggingface.co/models/{model_id}\"\n", + " response = requests.post(API_URL, headers=headers, json=payload)\n", + " return response.json()\n", + "\n", + " model_id = \"gpt2\"\n", + " api_token = \"hf_****\" # get yours at hf.co/settings/tokens\n", + " data = query(\"The goal of life is\", model_id, api_token)\n", + " print(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. Try the following questions with [ChatGPT](https://openai.com/blog/chatgpt) (GPT3.5 without access to the web) and with GPTBing in creative mode (GPT4 with access to the web). Note that the latter requires installing Microsoft Edge.\n", + "\n", + " Pick someone you know who is likely to have a web presence but is not super famous (not Musk or Trump). Ask GPT for a two-paragraph biography. How good is it?\n", + "\n", + " Ask it something like “What is the US, UK, Germany, China, and Japan's per capita income over the past ten years? Plot the data in a single figure” (depending on when and where you run this, you will need to paste the resulting Python code into a colab notebook). 
Try asking it questions about the data or the definition of “per capita income” used. How good is it?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Submit your feedback\n", + "content_review(f\"{feedback_prefix}_Play_around_with_LLMs_Activity\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# Summary\n", + "\n", + "In this tutorial you have become familiar with modern natural language processing (NLP) architectures. We learned about the core concepts, functionalities, and applications of these architectures. We also gained insights into prompt engineering and learned about GPT." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# Daily survey\n", + "\n", + "Don't forget to complete your reflections and content check in the daily survey! Please be patient after logging in as there is a small delay before you will be redirected to the survey.\n", + "\n", + "\"button" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# Bonus Section: Using Large Language Models (LLMs)\n", + "\n", + "This video tells you what large language models are being used for now and how you can use them. For instance, personalized tutoring, language practice, improving writing, exam preparation, writing help and data science."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Video 6: Using GPT\n", + "from ipywidgets import widgets\n", + "from IPython.display import YouTubeVideo\n", + "from IPython.display import IFrame\n", + "from IPython.display import display\n", + "\n", + "\n", + "class PlayVideo(IFrame):\n", + " def __init__(self, id, source, page=1, width=400, height=300, **kwargs):\n", + " self.id = id\n", + " if source == 'Bilibili':\n", + " src = f'https://player.bilibili.com/player.html?bvid={id}&page={page}'\n", + " elif source == 'Osf':\n", + " src = f'https://mfr.ca-1.osf.io/render?url=https://osf.io/download/{id}/?direct%26mode=render'\n", + " super(PlayVideo, self).__init__(src, width, height, **kwargs)\n", + "\n", + "\n", + "def display_videos(video_ids, W=400, H=300, fs=1):\n", + " tab_contents = []\n", + " for i, video_id in enumerate(video_ids):\n", + " out = widgets.Output()\n", + " with out:\n", + " if video_ids[i][0] == 'Youtube':\n", + " video = YouTubeVideo(id=video_ids[i][1], width=W,\n", + " height=H, fs=fs, rel=0)\n", + " print(f'Video available at https://youtube.com/watch?v={video.id}')\n", + " else:\n", + " video = PlayVideo(id=video_ids[i][1], source=video_ids[i][0], width=W,\n", + " height=H, fs=fs, autoplay=False)\n", + " if video_ids[i][0] == 'Bilibili':\n", + " print(f'Video available at https://www.bilibili.com/video/{video.id}')\n", + " elif video_ids[i][0] == 'Osf':\n", + " print(f'Video available at https://osf.io/{video.id}')\n", + " display(video)\n", + " tab_contents.append(out)\n", + " return tab_contents\n", + "\n", + "\n", + "video_ids = [('Youtube', 'JdXfuj6RP4Y'), ('Bilibili', 'BV1eX4y1v7c8')]\n", + "tab_contents = display_videos(video_ids, W=854, H=480)\n", + "tabs = widgets.Tab()\n", + "tabs.children = tab_contents\n", + "for i in range(len(tab_contents)):\n", + " tabs.set_title(i, video_ids[i][0])\n", + "display(tabs)" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Submit your feedback\n", + "content_review(f\"{feedback_prefix}_What_models_Video\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "gpuType": "T4", + "include_colab_link": true, + "name": "W3D1_Tutorial2", + "provenance": [], + "toc_visible": true + }, + "gpuClass": "standard", + "kernel": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "toc-autonumbering": true + }, + "nbformat": 4, + "nbformat_minor": 1 +}