From 4cef1e4013806998f6d3571dfa84c0ba76c3153e Mon Sep 17 00:00:00 2001 From: Soan Kim <39689481+SoanKim@users.noreply.github.com> Date: Sun, 6 Aug 2023 18:44:55 +0200 Subject: [PATCH 01/12] # Solved problems with the video # Students can easily modify the reward function and run it directly with Custom_LunarLander class --- .../ReinforcementLearning/lunar_lander.ipynb | 249 ++++++++++++++---- 1 file changed, 198 insertions(+), 51 deletions(-) diff --git a/projects/ReinforcementLearning/lunar_lander.ipynb b/projects/ReinforcementLearning/lunar_lander.ipynb index 5f58f38f8..8106a6c89 100644 --- a/projects/ReinforcementLearning/lunar_lander.ipynb +++ b/projects/ReinforcementLearning/lunar_lander.ipynb @@ -60,7 +60,9 @@ "# @title Update/Upgrade the system and install libs\n", "!apt-get update > /dev/null 2>&1\n", "!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1\n", - "!apt-get install -y swig build-essential python-dev python3-dev > /dev/null 2>&1" + "!apt-get install -y swig build-essential python-dev python3-dev > /dev/null 2>&1\n", + "!apt-get install x11-utils\n", + "!apt-get install xvfb" ] }, { @@ -75,23 +77,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m178.4/178.4 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m925.5/925.5 kB\u001b[0m \u001b[31m25.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m434.7/434.7 kB\u001b[0m \u001b[31m42.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m54.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for AutoROM.accept-rom-license (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m374.4/374.4 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.3/18.3 MB\u001b[0m \u001b[31m74.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m86.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Building wheel for box2d-py (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m853.1/853.1 kB\u001b[0m \u001b[31m12.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m103.1/103.1 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.9/13.9 MB\u001b[0m \u001b[31m79.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h" + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m178.4/178.4 kB\u001B[0m \u001B[31m4.6 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m925.5/925.5 kB\u001B[0m \u001B[31m25.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m434.7/434.7 kB\u001B[0m \u001B[31m42.9 MB/s\u001B[0m eta 
\u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25h Installing build dependencies ... \u001B[?25l\u001B[?25hdone\n", + " Getting requirements to build wheel ... \u001B[?25l\u001B[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001B[?25l\u001B[?25hdone\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m1.7/1.7 MB\u001B[0m \u001B[31m54.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25h Building wheel for AutoROM.accept-rom-license (pyproject.toml) ... \u001B[?25l\u001B[?25hdone\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m374.4/374.4 kB\u001B[0m \u001B[31m7.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25h Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m18.3/18.3 MB\u001B[0m \u001B[31m74.1 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m1.8/1.8 MB\u001B[0m \u001B[31m86.1 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25h Building wheel for box2d-py (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m853.1/853.1 kB\u001B[0m \u001B[31m12.8 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m103.1/103.1 kB\u001B[0m \u001B[31m3.3 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m13.9/13.9 MB\u001B[0m \u001B[31m79.6 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", + "\u001B[?25h" ] } ], @@ -104,7 +106,11 @@ "!pip install pyvirtualdisplay --quiet\n", "!pip install pyglet --quiet\n", "!pip install pygame --quiet\n", - "!pip install minigrid --quiet" + "!pip install minigrid --quiet\n", + "!pip install -q swig --quiet\n", + "!pip install -q gymnasium[box2d] --quiet\n", + "!pip install 'minigrid<=2.1.1' --quiet\n", + "!pip3 install box2d-py --quiet" ] }, { @@ -154,6 +160,7 @@ "\n", "import gymnasium as gym\n", "from gym import spaces\n", + "from gym.envs.box2d.lunar_lander import *\n", "from gym.wrappers.monitoring.video_recorder import VideoRecorder" ] }, @@ -391,7 +398,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.10/dist-packages/gym/wrappers/monitoring/video_recorder.py:101: DeprecationWarning: \u001b[33mWARN: is marked as deprecated and will be removed in the future.\u001b[0m\n", + "/usr/local/lib/python3.10/dist-packages/gym/wrappers/monitoring/video_recorder.py:101: DeprecationWarning: \u001B[33mWARN: is marked as deprecated and will be removed in the future.\u001B[0m\n", " logger.deprecation(\n" ] }, @@ -551,7 +558,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.10/dist-packages/gym/wrappers/monitoring/video_recorder.py:101: DeprecationWarning: \u001b[33mWARN: is marked as deprecated and will be removed in the future.\u001b[0m\n", + "/usr/local/lib/python3.10/dist-packages/gym/wrappers/monitoring/video_recorder.py:101: 
DeprecationWarning: \u001B[33mWARN: is marked as deprecated and will be removed in the future.\u001B[0m\n", " logger.deprecation(\n" ] }, @@ -771,36 +778,176 @@ }, "outputs": [], "source": [ - "def step(self, actions):\n", - " ...\n", - " ...\n", - " ...\n", - " reward = 0\n", - " shaping = (\n", - " -100 * np.sqrt(state[0] * state[0] + state[1] * state[1])\n", - " - 100 * np.sqrt(state[2] * state[2] + state[3] * state[3])\n", - " - 100 * abs(state[4])\n", - " + 10 * state[6]\n", - " + 10 * state[7]\n", - " ) # And ten points for legs contact, the idea is if you\n", - " # lose contact again after landing, you get negative reward\n", - " if self.prev_shaping is not None:\n", - " reward = shaping - self.prev_shaping\n", - " self.prev_shaping = shaping\n", - "\n", - " reward -= (\n", - " m_power * 0.30\n", - " ) # less fuel spent is better, about -30 for heuristic landing. You should modify these values.\n", - " reward -= s_power * 0.03\n", - "\n", - " done = False\n", - " if self.game_over or abs(state[0]) >= 1.0:\n", - " done = True\n", - " reward = -100\n", - " if not self.lander.awake:\n", - " done = True\n", - " reward = +100\n", - " return np.array(state, dtype=np.float32), reward, done, {}" + "class Custom_LunarLander(LunarLander):\n", + "\n", + " def step(self, action):\n", + " assert self.lander is not None\n", + "\n", + " # Update wind\n", + " assert self.lander is not None, \"You forgot to call reset()\"\n", + " if self.enable_wind and not (\n", + " self.legs[0].ground_contact or self.legs[1].ground_contact\n", + " ):\n", + " # the function used for wind is tanh(sin(2 k x) + sin(pi k x)),\n", + " # which is proven to never be periodic, k = 0.01\n", + " wind_mag = (\n", + " math.tanh(\n", + " math.sin(0.02 * self.wind_idx)\n", + " + (math.sin(math.pi * 0.01 * self.wind_idx))\n", + " )\n", + " * self.wind_power\n", + " )\n", + " self.wind_idx += 1\n", + " self.lander.ApplyForceToCenter(\n", + " (wind_mag, 0.0),\n", + " True,\n", + " )\n", + "\n", + " # the 
function used for torque is tanh(sin(2 k x) + sin(pi k x)),\n", + " # which is proven to never be periodic, k = 0.01\n", + " torque_mag = math.tanh(\n", + " math.sin(0.02 * self.torque_idx)\n", + " + (math.sin(math.pi * 0.01 * self.torque_idx))\n", + " ) * (self.turbulence_power)\n", + " self.torque_idx += 1\n", + " self.lander.ApplyTorque(\n", + " (torque_mag),\n", + " True,\n", + " )\n", + "\n", + " if self.continuous:\n", + " action = np.clip(action, -1, +1).astype(np.float32)\n", + " else:\n", + " assert self.action_space.contains(\n", + " action\n", + " ), f\"{action!r} ({type(action)}) invalid \"\n", + "\n", + " # Engines\n", + " tip = (math.sin(self.lander.angle), math.cos(self.lander.angle))\n", + " side = (-tip[1], tip[0])\n", + " dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)]\n", + "\n", + " m_power = 0.0\n", + " if (self.continuous and action[0] > 0.0) or (\n", + " not self.continuous and action == 2\n", + " ):\n", + " # Main engine\n", + " if self.continuous:\n", + " m_power = (np.clip(action[0], 0.0, 1.0) + 1.0) * 0.5 # 0.5..1.0\n", + " assert m_power >= 0.5 and m_power <= 1.0\n", + " else:\n", + " m_power = 1.0\n", + " # 4 is move a bit downwards, +-2 for randomness\n", + " ox = tip[0] * (4 / SCALE + 2 * dispersion[0]) + side[0] * dispersion[1]\n", + " oy = -tip[1] * (4 / SCALE + 2 * dispersion[0]) - side[1] * dispersion[1]\n", + " impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy)\n", + " p = self._create_particle(\n", + " 3.5, # 3.5 is here to make particle speed adequate\n", + " impulse_pos[0],\n", + " impulse_pos[1],\n", + " m_power,\n", + " ) # particles are just a decoration\n", + " p.ApplyLinearImpulse(\n", + " (ox * MAIN_ENGINE_POWER * m_power, oy * MAIN_ENGINE_POWER * m_power),\n", + " impulse_pos,\n", + " True,\n", + " )\n", + " self.lander.ApplyLinearImpulse(\n", + " (-ox * MAIN_ENGINE_POWER * m_power, -oy * MAIN_ENGINE_POWER * m_power),\n", + " impulse_pos,\n", + " True,\n", + " )\n", + 
"\n", + " s_power = 0.0\n", + " if (self.continuous and np.abs(action[1]) > 0.5) or (\n", + " not self.continuous and action in [1, 3]\n", + " ):\n", + " # Orientation engines\n", + " if self.continuous:\n", + " direction = np.sign(action[1])\n", + " s_power = np.clip(np.abs(action[1]), 0.5, 1.0)\n", + " assert s_power >= 0.5 and s_power <= 1.0\n", + " else:\n", + " direction = action - 2\n", + " s_power = 1.0\n", + " ox = tip[0] * dispersion[0] + side[0] * (\n", + " 3 * dispersion[1] + direction * SIDE_ENGINE_AWAY / SCALE\n", + " )\n", + " oy = -tip[1] * dispersion[0] - side[1] * (\n", + " 3 * dispersion[1] + direction * SIDE_ENGINE_AWAY / SCALE\n", + " )\n", + " impulse_pos = (\n", + " self.lander.position[0] + ox - tip[0] * 17 / SCALE,\n", + " self.lander.position[1] + oy + tip[1] * SIDE_ENGINE_HEIGHT / SCALE,\n", + " )\n", + " p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power)\n", + " p.ApplyLinearImpulse(\n", + " (ox * SIDE_ENGINE_POWER * s_power, oy * SIDE_ENGINE_POWER * s_power),\n", + " impulse_pos,\n", + " True,\n", + " )\n", + " self.lander.ApplyLinearImpulse(\n", + " (-ox * SIDE_ENGINE_POWER * s_power, -oy * SIDE_ENGINE_POWER * s_power),\n", + " impulse_pos,\n", + " True,\n", + " )\n", + "\n", + " self.world.Step(1.0 / FPS, 6 * 30, 2 * 30)\n", + "\n", + " pos = self.lander.position\n", + " vel = self.lander.linearVelocity\n", + " state = [\n", + " (pos.x - VIEWPORT_W / SCALE / 2) / (VIEWPORT_W / SCALE / 2),\n", + " (pos.y - (self.helipad_y + LEG_DOWN / SCALE)) / (VIEWPORT_H / SCALE / 2),\n", + " vel.x * (VIEWPORT_W / SCALE / 2) / FPS,\n", + " vel.y * (VIEWPORT_H / SCALE / 2) / FPS,\n", + " self.lander.angle,\n", + " 20.0 * self.lander.angularVelocity / FPS,\n", + " 1.0 if self.legs[0].ground_contact else 0.0,\n", + " 1.0 if self.legs[1].ground_contact else 0.0,\n", + " ]\n", + " assert len(state) == 8\n", + "\n", + " # Compare with / without shaping, referring the state description below\n", + " '''\n", + " state[0]: the horizontal 
coordinate\n", + " state[1]: the vertical coordinate\n", + " state[2]: the horizontal speed\n", + " state[3]: the vertical speed\n", + " state[4]: the angle\n", + " state[5]: the angular speed\n", + " state[6]: first leg contact\n", + " state[7]: second leg contact\n", + " '''\n", + " reward = 0\n", + " shaping = (\n", + " -100 * np.sqrt(state[0] * state[0] + state[1] * state[1])\n", + " - 100 * np.sqrt(state[2] * state[2] + state[3] * state[3])\n", + " - 100 * abs(state[4])\n", + " + 10 * state[6]\n", + " + 10 * state[7]\n", + " ) # And ten points for legs contact, the idea is if you\n", + " # lose contact again after landing, you get negative reward\n", + " if self.prev_shaping is not None:\n", + " reward = shaping - self.prev_shaping\n", + " self.prev_shaping = shaping\n", + "\n", + " reward -= (\n", + " m_power * 0.30\n", + " ) # less fuel spent is better, about -30 for heuristic landing\n", + " reward -= s_power * 0.03\n", + "\n", + " terminated = False\n", + " if self.game_over or abs(state[0]) >= 1.0:\n", + " terminated = True\n", + " reward = -100\n", + " if not self.lander.awake:\n", + " terminated = True\n", + " reward = +100\n", + "\n", + " if self.render_mode == \"human\":\n", + " self.render()\n", + " return np.array(state, dtype=np.float32), reward, terminated, False, {}" ] }, { From 4f22506213ac428d4db1024e8cf60cacc0b41964 Mon Sep 17 00:00:00 2001 From: Soan Kim <39689481+SoanKim@users.noreply.github.com> Date: Wed, 3 Jul 2024 18:41:04 +0900 Subject: [PATCH 02/12] # swig should be installed before gym[box2d] to avoid "error: subprocess-exited-with-error; ERROR: Failed building wheel for box2d-py ERROR: Could not build wheels for box2d-py, which is required to install pyproject.toml-based projects." 
--- projects/ReinforcementLearning/lunar_lander.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/ReinforcementLearning/lunar_lander.ipynb b/projects/ReinforcementLearning/lunar_lander.ipynb index 8106a6c89..7622f5b5a 100644 --- a/projects/ReinforcementLearning/lunar_lander.ipynb +++ b/projects/ReinforcementLearning/lunar_lander.ipynb @@ -102,12 +102,12 @@ "!pip install rarfile --quiet\n", "!pip install stable-baselines3[extra] --quiet\n", "!pip install ale-py --quiet\n", + "!pip install -q swig --quiet\n", "!pip install gym[box2d] --quiet\n", "!pip install pyvirtualdisplay --quiet\n", "!pip install pyglet --quiet\n", "!pip install pygame --quiet\n", "!pip install minigrid --quiet\n", - "!pip install -q swig --quiet\n", "!pip install -q gymnasium[box2d] --quiet\n", "!pip install 'minigrid<=2.1.1' --quiet\n", "!pip3 install box2d-py --quiet" From 7c4989e40cc9f105dd4e201b7b079845ce2e7570 Mon Sep 17 00:00:00 2001 From: dalia-nasr Date: Thu, 4 Jul 2024 00:50:24 +0300 Subject: [PATCH 03/12] updated dataset source for Twitter sentiment analysis template --- .../sentiment_analysis.ipynb | 1677 +---------------- 1 file changed, 1 insertion(+), 1676 deletions(-) diff --git a/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb b/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb index 7cc1ac416..a6f073666 100644 --- a/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb +++ b/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb @@ -1,1676 +1 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "execution": {}, - "id": "view-in-github" - }, - "source": [ - "\"Open   \"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "# Twitter Sentiment Analysis\n", - "\n", - "**By Neuromatch Academy**\n", - "\n", - "__Content creators:__ Juan Manuel Rodriguez, Salomey Osei, Gonzalo Uribarri\n", - "\n", - "__Production editors:__ Amita 
Kapoor, Spiros Chavlis" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# Welcome to the NLP project template\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# Step 1: Questions and goals\n", - "\n", - "* Can we infer emotion from a tweet text?\n", - "* How words are distributed accross the dataset?\n", - "* Are words related to one kind of emotion?" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# Step 2: Literature review\n", - "\n", - "[Original Dataset Paper](https://cs.stanford.edu/people/alecmgo/papers/TwitterDistantSupervision09.pdf)\n", - "\n", - "[Papers with code](https://paperswithcode.com/dataset/imdb-movie-reviews)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# Step 3: Load and explore the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "execution": {} - }, - "outputs": [], - "source": [ - "# @title Install dependencies\n", - "!pip install pandas --quiet\n", - "!pip install torchtext --quiet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# We import some libraries to load the dataset\n", - "import os\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from collections import Counter\n", - "from tqdm.notebook import tqdm\n", - "\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "import torch.nn.functional as F\n", - "from torch.utils.data import TensorDataset, DataLoader\n", - "\n", - "import torchtext\n", - "from torchtext.data import get_tokenizer\n", - "\n", - "from sklearn.utils import shuffle\n", - "from sklearn.metrics import 
classification_report\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.feature_extraction.text import CountVectorizer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "You can find the dataset we are going to use in [this website](http://help.sentiment140.com/for-students/)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "import requests, zipfile, io\n", - "url = 'http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip'\n", - "r = requests.get(url)\n", - "z = zipfile.ZipFile(io.BytesIO(r.content))\n", - "z.extractall()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
polarityiddatequeryusertext
001467810369Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, t...
101467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
201467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
301467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
401467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
\n", - "
" - ], - "text/plain": [ - " polarity ... text\n", - "0 0 ... @switchfoot http://twitpic.com/2y1zl - Awww, t...\n", - "1 0 ... is upset that he can't update his Facebook by ...\n", - "2 0 ... @Kenichan I dived many times for the ball. Man...\n", - "3 0 ... my whole body feels itchy and like its on fire \n", - "4 0 ... @nationwideclass no, it's not behaving at all....\n", - "\n", - "[5 rows x 6 columns]" - ] - }, - "execution_count": 4, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "# We load the dataset\n", - "header_list = [\"polarity\", \"id\", \"date\", \"query\", \"user\", \"text\"]\n", - "df = pd.read_csv('training.1600000.processed.noemoticon.csv',\n", - " encoding = \"ISO-8859-1\", names=header_list)\n", - "\n", - "# Let's have a look at it\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "For this project we will use only the text and the polarity of the tweet. Notice that polarity is 0 for negative tweets and 4 for positive tweet." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "X = df.text.values\n", - "\n", - "# Changes values from [0,4] to [0,1]\n", - "y = (df.polarity.values > 1).astype(int)\n", - "\n", - "\n", - "# Split the data into train and test\n", - "x_train_text, x_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "The first thing we have to do before working on the models is to familiarize ourselves with the dataset. This is called Exploratory Data Analisys (EDA)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1: @paisleypaisley LOL why do i get ideas so far in advance? 
it's not even june yet! we need a third knitter to have our own summer group \n", - "0: worst headache ever \n", - "0: @ewaniesciuszko i am so sad i wont see you! I miss you already. and yeah! that's perfect; i come back the 18th!\n", - "1: doesn't know how to spell conked \n", - "0: "So we stand here now and no one knows us at all I won't get used to this I won't get used to being gone"...I miss home and everyone -a\n" - ] - } - ], - "source": [ - "for s, l in zip(x_train_text[:5], y_train[:5]):\n", - " print('{}: {}'.format(l, s))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "An interesting thing to analyze is the Word Distribution. In order to count the occurrences of each word, we should tokenize the sentences first." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Before Tokenize: worst headache ever \n", - "After Tokenize: ['worst', 'headache', 'ever']\n" - ] - } - ], - "source": [ - "tokenizer = get_tokenizer(\"basic_english\")\n", - "\n", - "print('Before Tokenize: ', x_train_text[1])\n", - "print('After Tokenize: ', tokenizer(x_train_text[1]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "499e7fb54aa048afb3cba78dd8d6bb0e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, max=1280000.0), HTML(value='')))" - ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fff9bd0ae74e46b0ad97ad980a834a58", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - 
"HBox(children=(FloatProgress(value=0.0, max=320000.0), HTML(value='')))" - ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "x_train_token = [tokenizer(s) for s in tqdm(x_train_text)]\n", - "x_test_token = [tokenizer(s) for s in tqdm(x_test_text)]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "We can count the words occurences and see how many different words are present in our dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of different Tokens in our Dataset: 669284\n", - "['.', 'i', '!', \"'\", 'to', 'the', ',', 'a', 'my', 'it', 'and', 'you', '?', 'is', 'for', 'in', 's', 'of', 't', 'on', 'that', 'me', 'so', 'have', 'm', 'but', 'just', 'with', 'be', 'at', 'not', 'was', 'this', 'now', 'can', 'good', 'up', 'day', 'all', 'get', 'out', 'like', 'are', 'no', 'go', 'http', '-', 'today', 'do', 'too', 'your', 'work', 'going', 'love', 'we', 'got', 'what', 'lol', 'time', 'back', 'from', 'u', 'one', 'will', 'know', 'about', 'im', 'really', 'don', 'am', 'had', ')', 'see', 'some', 'there', 'its', '&', 'how', 'if', 'still', 'they', '"', 'night', '(', 'well', 'want', 'new', 'think', '2', 'home', 'thanks', 'll', 'oh', 'when', 'as', 'he', 'more', 'here', 'much', 'off']\n" - ] - } - ], - "source": [ - "words = Counter()\n", - "for s in x_train_token:\n", - " for w in s:\n", - " words[w] += 1\n", - "\n", - "sorted_words = list(words.keys())\n", - "sorted_words.sort(key=lambda w: words[w], reverse=True)\n", - "print(f\"Number of different Tokens in our Dataset: {len(sorted_words)}\")\n", - "print(sorted_words[:100])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Now we can plot their distribution." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The 0.13970153178620734% most common words account for the 80.00532743602652% of the occurrences\n" - ] - } - ], - "source": [ - "count_occurences = sum(words.values())\n", - "\n", - "accumulated = 0\n", - "counter = 0\n", - "\n", - "while accumulated < count_occurences * 0.8:\n", - " accumulated += words[sorted_words[counter]]\n", - " counter += 1\n", - "\n", - "print(f\"The {counter * 100 / len(words)}% most common words \"\n", - " f\"account for the {accumulated * 100 / count_occurences}% of the occurrences\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEDCAYAAAAlRP8qAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAARvUlEQVR4nO3dbZBeZ13H8e/PpkUBpWhWR5NqoqZoRBBcSxVHIuCYtk6jIzqNKOIU8sYiKj7EUYvWNyCOTzMFjFgrjLYWZDBDI1WxWkdt7VawNA2toVS6Fc3SFlQcLRn+vrhPmJvt7t5nk7O5d6/9fmZ29jxcPed/5kp/99nrPNypKiRJG9/nTLsASdIwDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEZMNdCTXJfkRJJ7erb//iT3Jjma5I/Wuj5J2kgyzfvQk3wb8N/AW6vqmRPa7gJuAl5YVY8l+eKqOnE26pSkjWCqZ+hVdRvw6PiyJF+V5D1J7kryt0m+plv1SuDaqnqs+28Nc0kasx7H0A8Br6qqbwR+Cnhjt/xC4MIkf5fk9iR7p1ahJK1DW6ZdwLgkTwW+BXh7klOLn9T93gLsAvYA24Hbknx9VX38bNcpSevRugp0Rn8xfLyqvmGJdfPAHVX1KeDDSe5nFPB3ns0CJWm9WldDLlX1n4zC+vsAMvLsbvW7GJ2dk2QroyGYB6ZRpyStR9O+bfEG4B+AZySZT3Il8FLgyiT/DBwF9nXNbwEeSXIvcCvw01X1yDTqlqT1aKq3LUqShrOuhlwkSadvahdFt27dWjt27JjW7iVpQ7rrrrs+VlUzS62bWqDv2LGDubm5ae1ekjakJP+63DqHXCSpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqRHr7X3ovew4ePNnph983WVTrESS1g/P0CWpEQa6JDXCQJekRhjoktSIiYGe5LokJ5Lcs0KbPUnen+Rokr8ZtkRJUh99ztCvB/YutzLJ+cAbgcur6uuA7xumNEnSakwM9Kq6DXh0hSY/ALyzqj7StT8xUG2SpFUYYgz9QuDpSf46yV1JXrZcwyQHkswlmVtYWBhg15KkU4YI9C3ANwKXAd8J/GKSC5dqWFWH
qmq2qmZnZpb8SjxJ0mka4knReeCRqvok8MkktwHPBu4fYNuSpJ6GOEP/U+Bbk2xJ8mTgecCxAbYrSVqFiWfoSW4A9gBbk8wDrwXOBaiqN1fVsSTvAe4GPg28paqWvcVRkrQ2JgZ6Ve3v0eYNwBsGqUiSdFp8UlSSGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1IiJgZ7kuiQnkqz4LURJvinJySQvGa48SVJffc7Qrwf2rtQgyTnA64E/H6AmSdJpmBjoVXUb8OiEZq8C/gQ4MURRkqTVO+Mx9CTbgO8B3tSj7YEkc0nmFhYWznTXkqQxQ1wU/U3gZ6vq05MaVtWhqpqtqtmZmZkBdi1JOmXLANuYBW5MArAVuDTJyap61wDbliT1dMaBXlU7T00nuR54t2EuSWffxEBPcgOwB9iaZB54LXAuQFW9eU2rkyT1NjHQq2p/341V1cvPqBpJ0mnzSVFJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMmBnqS65KcSHLPMutfmuTuJB9I8vdJnj18mZKkSfqcoV8P7F1h/YeBF1TV1wO/AhwaoC5J0ir1+Qq625LsWGH934/N3g5sP/OyJEmrNfQY+pXAny23MsmBJHNJ5hYWFgbetSRtboMFepJvZxToP7tcm6o6VFWzVTU7MzMz1K4lSfQYcukjybOAtwCXVNUjQ2xTkrQ6Z3yGnuTLgXcCP1RV9595SZKk0zHxDD3JDcAeYGuSeeC1wLkAVfVm4Grgi4A3JgE4WVWza1WwJGlpfe5y2T9h/SuAVwxWkSTptPikqCQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpERMDPcl1SU4kuWeZ9Uny20mOJ7k7yXOHL1OSNEmfM/Trgb0rrL8E2NX9HADedOZlSZJWa2KgV9VtwKMrNNkHvLVGbgfOT/KlQxUoSepniDH0bcBDY/Pz3bInSHIgyVySuYWFhQF2LUk65axeFK2qQ1U1W1WzMzMzZ3PXktS8IQL9YeCCsfnt3TJJ0lk0RKAfBl7W3e1yMfCJqvroANuVJK3ClkkNktwA7AG2JpkHXgucC1BVbwaOAJcCx4H/AX5krYqVJC1vYqBX1f4J6wv40cEqkiSdFp8UlaRGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqRG9Aj3J3iT3JTme5OAS6788ya1J3pfk7iSXDl+qJGklEwM9yTnAtcAlwG5gf5Ldi5r9AnBTVT0HuAJ449CFSpJW1ucM/SLgeFU9UFWPAzcC+xa1KeALuumnAf82XImSpD4mfgUdsA14aGx+Hnjeoja/BPx5klcBTwFePEh1kqTehroouh+4vqq2M/rC6LclecK2kxxIMpdkbmFhYaBdS5KgX6A/DFwwNr+9WzbuSuAmgKr6B+Bzga2LN1RVh6pqtqpmZ2ZmTq9iSdKS+gT6ncCuJDuTnMfooufhRW0+ArwIIMnXMgp0T8El6SyaGOhVdRK4CrgFOMbobpajSa5JcnnX7DXAK5P8M3AD8PKqqrUqWpL0RH0uilJVR4Aji5ZdPTZ9L/D8YUvrZ8fBmz8z/eDrLptGCZK0LvikqCQ1wkCXpEYY6JLUiOYCfcfBmz9rXF2SNovmAl2SNisDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJakSvQE+yN8l9SY4nObhMm+9Pcm+So0n+aNgyJUmTTPzGoiTnANcC3wHMA3cmOdx9S9GpNruAnwOe
X1WPJfnitSpYkrS0PmfoFwHHq+qBqnocuBHYt6jNK4Frq+oxgKo6MWyZkqRJ+gT6NuChsfn5btm4C4ELk/xdktuT7F1qQ0kOJJlLMrewsHB6FUuSljTURdEtwC5gD7Af+N0k5y9uVFWHqmq2qmZnZmYG2rUkCfoF+sPABWPz27tl4+aBw1X1qar6MHA/o4CXJJ0lfQL9TmBXkp1JzgOuAA4vavMuRmfnJNnKaAjmgQHrlCRNMDHQq+okcBVwC3AMuKmqjia5JsnlXbNbgEeS3AvcCvx0VT2yVkVLkp5o4m2LAFV1BDiyaNnVY9MF/GT3I0maAp8UlaRGGOiS1AgDXZIaYaBLUiMMdElqRNOBvuPgzew4ePO0y5Cks6LpQJekzcRAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY3YNIHuPemSWrdpAl2SWmegS1IjegV6kr1J7ktyPMnBFdp9b5JKMjtciZKkPiYGepJzgGuBS4DdwP4ku5do9/nAq4E7hi5yaI6nS2pRnzP0i4DjVfVAVT0O3AjsW6LdrwCvB/53wPokST31CfRtwENj8/Pdss9I8lzggqpa8bQ3yYEkc0nmFhYWVl2sJGl5Z3xRNMnnAL8OvGZS26o6VFWzVTU7MzNzprsehMMvklrRJ9AfBi4Ym9/eLTvl84FnAn+d5EHgYuCwF0Yl6ezqE+h3AruS7ExyHnAFcPjUyqr6RFVtraodVbUDuB24vKrm1qRiSdKSJgZ6VZ0ErgJuAY4BN1XV0STXJLl8rQuUJPWzpU+jqjoCHFm07Opl2u4587IkSavlk6KS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQb6GF8DIGkjM9AlqREGuiQ1wkCXpEYY6JLUCANdkhphoC/DO14kbTQGuiQ1wkDvYfxsfblpSZo2A12SGtEr0JPsTXJfkuNJDi6x/ieT3Jvk7iTvTfIVw5cqSVrJxEBPcg5wLXAJsBvYn2T3ombvA2ar6lnAO4BfHbpQSdLK+pyhXwQcr6oHqupx4EZg33iDqrq1qv6nm70d2D5smZKkSfoE+jbgobH5+W7Zcq4E/mypFUkOJJlLMrewsNC/yg3AC6SSpq3Xl0T3leQHgVngBUutr6pDwCGA2dnZGnLf68l4sD/4usumWImkzaRPoD8MXDA2v71b9lmSvBj4eeAFVfV/w5QnSeqrT6DfCexKspNRkF8B/MB4gyTPAX4H2FtVJwavcgNbbhjGM3dJQ5s4hl5VJ4GrgFuAY8BNVXU0yTVJLu+avQF4KvD2JO9PcnjNKpYkLanXGHpVHQGOLFp29dj0iweuS5K0SoNeFFV/XjiVNDQDfR1wnF3SEAz0dcygl7QaBvoG5HCNpKX4tsUG+EpfSWCgN81wlzYXA32T8Cxeap+BvskZ7lI7DHR9Rp+v2vMDQFq/DHSdNsNdWl+8bVGDOBXsD77uslWHvLdeSsMw0DV1p/MBsNQHiB8M2uwcclFTVnsdYKhpaT3wDF0awFoHu399qA8DXdoglhtmmvb0mfLDajgGuqSp6vPBsB4+uDbCh1ivMfQke5Pcl+R4koNLrH9Skj/u1t+RZMfQhUqSVjYx0JOcA1wLXALsBvYn2b2o2ZXAY1X11cBvAK8fulBJ0sr6nKFfBByvqgeq6nHgRmDfojb7gD/opt8BvChJhitTkjRJqmrlBslLgL1V9Ypu/oeA51XVVWNt7unazHfzH+rafGzRtg4AB7rZZwD3nWH9W4GPTWzVFo95c/CYN4fTOeavqKqZpVac1YuiVXUIODTU9pLMVdXsUNvbCDzmzcFj3hyGPuY+Qy4PAxeMzW/vli3ZJskW4GnAI0MUKEnqp0+g3wnsSrIzyXnAFcDhRW0OAz/cTb8E+KuaNJYjSRrUxCGXqjqZ5CrgFuAc4LqqOprkGmCuqg4Dvwe8Lclx4FFGoX82DDZ8s4F4zJuDx7w5DHrMEy+KSpI2Bl/OJUmNMNAlqREbMtAnvYqgBUku
SHJrknuTHE3y6m75Fyb5iyT/0v1++rRrHVqSc5K8L8m7u/md3SsljnevmDhv2jUOKcn5Sd6R5INJjiX55tb7OclPdP+u70lyQ5LPba2fk1yX5ET3nM6pZUv2a0Z+uzv2u5M893T2ueECveerCFpwEnhNVe0GLgZ+tDvOg8B7q2oX8N5uvjWvBo6Nzb8e+I3u1RKPMXrVREt+C3hPVX0N8GxGx95sPyfZBvwYMFtVz2R0s8UVtNfP1wN7Fy1brl8vAXZ1PweAN53ODjdcoNPvVQQbXlV9tKr+qZv+L0b/k2/js1+z8AfAd0+nwrWRZDtwGfCWbj7ACxm9UgIaO+YkTwO+jdGdYlTV41X1cRrvZ0Z32H1e99zKk4GP0lg/V9VtjO76G7dcv+4D3lojtwPnJ/nS1e5zIwb6NuChsfn5blmzurdXPge4A/iSqvpot+rfgS+ZUllr5TeBnwE+3c1/EfDxqjrZzbfW3zuBBeD3u2GmtyR5Cg33c1U9DPwa8BFGQf4J4C7a7udTluvXQXJtIwb6ppLkqcCfAD9eVf85vq57eKuZ+06TfBdwoqrumnYtZ9EW4LnAm6rqOcAnWTS80mA/P53RGelO4MuAp/DEoYnmrUW/bsRA7/MqgiYkOZdRmP9hVb2zW/wfp/4U636fmFZ9a+D5wOVJHmQ0lPZCRuPL53d/mkN7/T0PzFfVHd38OxgFfMv9/GLgw1W1UFWfAt7JqO9b7udTluvXQXJtIwZ6n1cRbHjd2PHvAceq6tfHVo2/ZuGHgT8927Wtlar6uaraXlU7GPXrX1XVS4FbGb1SAto75n8HHkryjG7Ri4B7abifGQ21XJzkyd2/81PH3Gw/j1muXw8DL+vudrkY+MTY0Ex/VbXhfoBLgfuBDwE/P+161ugYv5XRn2N3A+/vfi5lNKb8XuBfgL8EvnData7R8e8B3t1NfyXwj8Bx4O3Ak6Zd38DH+g3AXNfX7wKe3no/A78MfBC4B3gb8KTW+hm4gdE1gk8x+kvsyuX6FQiju/c+BHyA0R1Aq96nj/5LUiM24pCLJGkJBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqxP8D+q4d+O9Hiz8AAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light", - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.bar(range(100), [words[w] for w in sorted_words[:100]])\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "It is very common to find this kind of distribution when analyzing corpus of text. This is referred to as the [zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Usually the number of words in the dictionary will be very large. \n", - "\n", - "Here are some thing we can do to reduce that number:\n", - "\n", - "* Remove puntuation.\n", - "* Remove stop-words.\n", - "* Steaming.\n", - "* Remove very uncommon words (the words that appears in fewer than N occations).\n", - "* Nothing: we can use a pretrain model that handles this kind of situations.\n", - "\n", - "\n", - "We used one of the simplest tokenizers availables. This tokenizer does not take into account many quirks of the language. Moreover, diferent languages have different quirks, so there is no \"universal\" tokenizers. There are many libraries that have \"better\" tokenizers:\n", - "\n", - "* [Spacy](https://spacy.io/): it can be accessed using: `get_tokenizer(\"spacy\")`. Spacy supports a wide range of languages.\n", - "* [Huggingface](https://huggingface.co/): it has many tokenizers for different laguages. [Doc](https://huggingface.co/transformers/main_classes/tokenizer.html)\n", - "* [NLTK](https://www.nltk.org/): it provides several tokenizers. One of them can be accessed using: `get_tokenizer(\"toktok\")`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# Step 4: choose toolkit\n", - "\n", - "Our goal is to train a model capable of estimating the sentiment of a tweet (positive or negative) by reading its content. 
To that end we will try 2 different approaches:\n", - "\n", - "* A logistic regression using sklearn. **NOTE**: it can probaly work better than an SVM model.\n", - "* A simple Embedding + RNN." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "## Logistic regression\n", - "\n", - "We will represent our senteces using binary vectorization. This means that our data would be represented as a matrix of instances by word with a one if the word is in the instance, and zero otherwise. Sklean vectorizers can also do things such as stop-word removal and puntuation removal, you can read more about in [the documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "vectorizer = CountVectorizer(binary=True)\n", - "x_train_cv = vectorizer.fit_transform(x_train_text)\n", - "x_test_cv = vectorizer.transform(x_test_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Before Vectorize: doesn't know how to spell conked \n" - ] - } - ], - "source": [ - "print('Before Vectorize: ', x_train_text[3])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "After Vectorize: \n", - " (0, 528584)\t1\n", - " (0, 165468)\t1\n", - " (0, 300381)\t1\n", - " (0, 242211)\t1\n", - " (0, 489893)\t1\n", - " (0, 134160)\t1\n" - ] - } - ], - "source": [ - "# Notice that the matriz is sparse\n", - "print('After Vectorize: ')\n", - "print(x_train_cv[3])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Now we can train our model. 
You can check the documentation of this logistic regressor [here](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic#sklearn.linear_model.LogisticRegression)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "data": { - "text/plain": [ - "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", - " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", - " multi_class='auto', n_jobs=None, penalty='l2',\n", - " random_state=None, solver='saga', tol=0.0001, verbose=0,\n", - " warm_start=False)" - ] - }, - "execution_count": 15, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "model = LogisticRegression(solver='saga')\n", - "model.fit(x_train_cv, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " 0 0.81 0.79 0.80 160000\n", - " 1 0.79 0.81 0.80 160000\n", - "\n", - " accuracy 0.80 320000\n", - " macro avg 0.80 0.80 0.80 320000\n", - "weighted avg 0.80 0.80 0.80 320000\n", - "\n" - ] - } - ], - "source": [ - "y_pred = model.predict(x_test_cv)\n", - "\n", - "print(classification_report(y_test, y_pred))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "## Explainable AI\n", - "The best thing about logistic regresion is that it is simple, and we can get some explanations." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1, 589260)\n", - "589260\n" - ] - } - ], - "source": [ - "print(model.coef_.shape)\n", - "print(len(vectorizer.vocabulary_))\n", - "\n", - "words_sk = list(vectorizer.vocabulary_.keys())\n", - "words_sk.sort(key=lambda w: model.coef_[0, vectorizer.vocabulary_[w]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "roni: -3.862597673594883\n", - "inaperfectworld: -3.5734362290886375\n", - "dontyouhate: -3.500197620227523\n", - "xbllygbsn: -3.412645372640648\n", - "anqju: -3.336405291553548\n", - "sad: -3.200522312464158\n", - "pakcricket: -3.1949158120163412\n", - "condolences: -3.132498019366488\n", - "heartbreaking: -3.066508733796654\n", - "saddest: -3.041999809733714\n", - "sadd: -3.029070563580306\n", - "heartbroken: -3.0287688233900174\n", - "boohoo: -3.022608649696793\n", - "sadface: -2.9918411285807234\n", - "rachelle_lefevr: -2.925057253107806\n", - "disappointing: -2.902524113779547\n", - "lvbu: -2.894705935001672\n", - "saddens: -2.8855127179984654\n", - "bummed: -2.83650014970307\n", - "neda: -2.792944556837498\n" - ] - } - ], - "source": [ - "for w in words_sk[:20]:\n", - " print('{}: {}'.format(w, model.coef_[0, vectorizer.vocabulary_[w]]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "iamsoannoyed: 2.8494314732277672\n", - "myfax: 2.797451563471618\n", - "jennamadison: 2.5667257393706113\n", - "yeyy: 2.478028598852801\n", - "tryout: 2.4383315790116677\n", - "goldymom: 2.4374026022205535\n", - "wooohooo: 2.40297322137544\n", - "thesupergirl: 2.3565118467330004\n", - "iammaxathotspot: 2.311648368632618\n", 
- "londicreations: 2.3074490293400993\n", - "smilin: 2.2991891636718216\n", - "worries: 2.2899429774914717\n", - "sinfulsignorita: 2.2798963640981817\n", - "finchensnail: 2.264302079155878\n", - "smackthis: 2.2376679263761083\n", - "kv: 2.2158393907798413\n", - "tojosan: 2.211784259253832\n", - "russmarshalek: 2.2095374025599384\n", - "traciknoppe: 2.1768297770350835\n", - "congratulations: 2.171590496227557\n" - ] - } - ], - "source": [ - "for w in reversed(words_sk[-20:]):\n", - " print('{}: {}'.format(w, model.coef_[0, vectorizer.vocabulary_[w]]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "What does this mean?\n", - "\n", - "Remember the `model.coef_` is the $W$ in:\n", - "\n", - "$$h(x)=\\sigma(WX + b)$$\n", - "\n", - "where the label 1 is a positive tweet and the label 0 is a negative tweet." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "## Recurrent Neural Network with Pytorch" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "In the previous section we use a Bag-Of-Words approach to represent each of the tweets. That meas that we only consider how many times each of the words appear in each of the tweets, we didnt take into account the order of the words. 
But we know that the word order is very important and carries relevant information.\n", - "\n", - "In this section we will solve the same task, but this time we will implement a Recurrent Neural Network (RNN) instead of using a simple Logistic Regression.Unlike feedforward neural networks, RNNs have cyclic connections making them powerful for modeling sequences.\n", - "\n", - "Let's start by importing the relevant libraries.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "def set_device():\n", - " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - " if device != \"cuda\":\n", - " print(\"WARNING: For this notebook to perform best, \"\n", - " \"if possible, in the menu under `Runtime` -> \"\n", - " \"`Change runtime type.` select `GPU` \")\n", - " else:\n", - " print(\"GPU is enabled in this notebook.\")\n", - "\n", - " return device" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "GPU is enabled in this notebook.\n" - ] - } - ], - "source": [ - "# Set the device (check if gpu is available)\n", - "device = set_device()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "First we will create a Dictionary (`word_to_idx`). This dictionary will map each Token (usually words) to an index (an integer number). We want to limit our dictionary to a certain number of tokens (`num_words_dict`), so we will include in our ditionary those with more occurrences." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['.', 'i', '!', \"'\", 'to', 'the', ',', 'a', 'my', 'it']" - ] - }, - "execution_count": 22, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "# From previous section, we have a list with the most used tokens\n", - "sorted_words[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Let's select only the most used." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "num_words_dict = 30000\n", - "# We reserve two numbers for special tokens.\n", - "most_used_words = sorted_words[:num_words_dict-2]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "We will add two extra Tokens to the dictionary, one for words outside the dictionary (`'UNK'`) and one for padding the sequences (`'PAD'`)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# dictionary to go from words to idx\n", - "word_to_idx = {}\n", - "# dictionary to go from idx to words (just in case)\n", - "idx_to_word = {}\n", - "\n", - "\n", - "# We include the special tokens first\n", - "PAD_token = 0\n", - "UNK_token = 1\n", - "\n", - "word_to_idx['PAD'] = PAD_token\n", - "word_to_idx['UNK'] = UNK_token\n", - "\n", - "idx_to_word[PAD_token] = 'PAD'\n", - "idx_to_word[UNK_token] = 'UNK'\n", - "\n", - "# We popullate our dictionaries with the most used words\n", - "for num,word in enumerate(most_used_words):\n", - " word_to_idx[word] = num + 2\n", - " idx_to_word[num+2] = word" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Our goal now is to transform each tweet from a sequence of tokens to a sequence of indexes. These sequences of indexes will be the input to our pytorch model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# A function to convert list of tokens to list of indexes\n", - "def tokens_to_idx(sentences_tokens,word_to_idx):\n", - " sentences_idx = []\n", - " for sent in sentences_tokens:\n", - " sent_idx = []\n", - " for word in sent:\n", - " if word in word_to_idx:\n", - " sent_idx.append(word_to_idx[word])\n", - " else:\n", - " sent_idx.append(word_to_idx['UNK'])\n", - " sentences_idx.append(sent_idx)\n", - " return sentences_idx" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "x_train_idx = tokens_to_idx(x_train_token,word_to_idx)\n", - "x_test_idx = tokens_to_idx(x_test_token,word_to_idx)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"Before converting: ['worst', 'headache', 'ever']\n", - "After converting: [721, 458, 237]\n" - ] - } - ], - "source": [ - "some_number = 1\n", - "print('Before converting: ', x_train_token[some_number])\n", - "print('After converting: ', x_train_idx[some_number])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "We need all the sequences to have the same length. To select an adequate sequence length, let's explore some statistics about the length of the tweets:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Max tweet word length: 229\n", - "Mean tweet word length: 15.0\n", - "99% percent under: 37.0\n" - ] - } - ], - "source": [ - "tweet_lens = np.asarray([len(sentence) for sentence in x_train_idx])\n", - "print('Max tweet word length: ',tweet_lens.max())\n", - "print('Mean tweet word length: ',np.median(tweet_lens))\n", - "print('99% percent under: ',np.quantile(tweet_lens,0.99))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "We cut the sequences which are larger than our chosen maximum length (`max_lenght`) and fill with zeros the ones that are shorter." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - " # We choose the max length\n", - " max_length = 40\n", - "\n", - "# A function to make all the sequence have the same lenght\n", - "# Note that the output is a Numpy matrix\n", - " def padding(sentences, seq_len):\n", - " features = np.zeros((len(sentences), seq_len),dtype=int)\n", - " for ii, tweet in enumerate(sentences):\n", - " len_tweet = len(tweet)\n", - " if len_tweet != 0:\n", - " if len_tweet <= seq_len:\n", - " # If its shorter, we fill with zeros (the padding Token index)\n", - " features[ii, -len(tweet):] = np.array(tweet)[:seq_len]\n", - " if len_tweet > seq_len:\n", - " # If its larger, we take the last 'seq_len' indexes\n", - " features[ii, :] = np.array(tweet)[-seq_len:]\n", - " return features" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# We convert our list of tokens into a numpy matrix\n", - "# where all instances have the same lenght\n", - "x_train_pad = padding(x_train_idx,max_length)\n", - "x_test_pad = padding(x_test_idx,max_length)\n", - "\n", - "# We convert our target list a numpy matrix\n", - "y_train_np = np.asarray(y_train)\n", - "y_test_np = np.asarray(y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Before padding: [1, 3, 71, 24, 122, 3, 533, 74, 13, 4, 3, 102, 13, 209, 2, 12, 150, 4, 22, 5, 18, 667, 3, 138, 61, 7, 3296, 4]\n", - "After padding: [ 0 0 0 0 0 0 0 0 0 0 0 0 1 3\n", - " 71 24 122 3 533 74 13 4 3 102 13 209 2 12\n", - " 150 4 22 5 18 667 3 138 61 7 3296 4]\n" - ] - } - ], - "source": [ - "some_number = 2\n", - "print('Before padding: ', x_train_idx[some_number])\n", - "print('After padding: ', x_train_pad[some_number])" - ] - }, - { - "cell_type": 
"markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Now, let's convert the data to pytorch format.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# create Tensor datasets\n", - "train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train_np))\n", - "valid_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test_np))\n", - "\n", - "# Batch size (this is an important hyperparameter)\n", - "batch_size = 100\n", - "\n", - "# dataloaders\n", - "# make sure to SHUFFLE your data\n", - "train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,drop_last = True)\n", - "valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size,drop_last = True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Each batch of data in our traning proccess will have the folllowing format:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sample input size: torch.Size([100, 40])\n", - "Sample input: \n", - " tensor([[ 0, 0, 0, ..., 4, 4, 4],\n", - " [ 0, 0, 0, ..., 7447, 14027, 2],\n", - " [ 0, 0, 0, ..., 100, 22241, 4],\n", - " ...,\n", - " [ 0, 0, 0, ..., 2702, 4409, 2],\n", - " [ 0, 0, 0, ..., 162, 17, 1],\n", - " [ 0, 0, 0, ..., 67, 12904, 49]])\n", - "Sample input: \n", - " tensor([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,\n", - " 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,\n", - " 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,\n", - " 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,\n", - " 0, 0, 1, 0])\n" - ] - } - ], - "source": [ - "# Obtain one batch of training data\n", - "dataiter = iter(train_loader)\n", - "sample_x, 
sample_y = dataiter.next()\n", - "\n", - "print('Sample input size: ', sample_x.size()) # batch_size, seq_length\n", - "print('Sample input: \\n', sample_x)\n", - "print('Sample input: \\n', sample_y)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "Now, we will define the `SentimentRNN` class. Most of the model's class will be familiar to you, but there are two important layers we would like you to pay attention to:\n", - "\n", - "* Embedding Layer\n", - "> This layer is like a linear layer, but it makes it posible to use a sequence of inedexes as inputs (instead of a sequence of one-hot-encoded vectors). During training, the Embedding layer learns a linear transformation from the space of words (a vector space of dimension `num_words_dict`) into the a new, smaller, vector space of dimension `embedding_dim`. We suggest you to read this [thread](https://discuss.pytorch.org/t/how-does-nn-embedding-work/88518/3) and the [pytorch documentation](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) if you want to learn more about this particular kind of layers.\n", - "\n", - "\n", - "* LSTM layer\n", - "> This is one of the most used class of Recurrent Neural Networks. In Pytorch we can add several stacked layers in just one line of code. In our case, the number of layers added are decided with the parameter `no_layers`. 
If you want to learn more about LSTMs we strongly recommend you this [Colahs thread](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) about them.\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "class SentimentRNN(nn.Module):\n", - " def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.1):\n", - " super(SentimentRNN,self).__init__()\n", - "\n", - " self.output_dim = output_dim\n", - " self.hidden_dim = hidden_dim\n", - " self.no_layers = no_layers\n", - " self.vocab_size = vocab_size\n", - " self.drop_prob = drop_prob\n", - "\n", - " # Embedding Layer\n", - " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n", - "\n", - " # LSTM Layers\n", - " self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,\n", - " num_layers=no_layers, batch_first=True,\n", - " dropout=self.drop_prob)\n", - "\n", - " # Dropout layer\n", - " self.dropout = nn.Dropout(drop_prob)\n", - "\n", - " # Linear and Sigmoid layer\n", - " self.fc = nn.Linear(self.hidden_dim, output_dim)\n", - " self.sig = nn.Sigmoid()\n", - "\n", - " def forward(self,x,hidden):\n", - " batch_size = x.size(0)\n", - "\n", - " # Embedding out\n", - " embeds = self.embedding(x)\n", - " #Shape: [batch_size x max_length x embedding_dim]\n", - "\n", - " # LSTM out\n", - " lstm_out, hidden = self.lstm(embeds, hidden)\n", - " # Shape: [batch_size x max_length x hidden_dim]\n", - "\n", - " # Select the activation of the last Hidden Layer\n", - " lstm_out = lstm_out[:,-1,:].contiguous()\n", - " # Shape: [batch_size x hidden_dim]\n", - "\n", - " ## You can instead average the activations across all the times\n", - " # lstm_out = torch.mean(lstm_out, 1).contiguous()\n", - "\n", - " # Dropout and Fully connected layer\n", - " out = self.dropout(lstm_out)\n", - " out = self.fc(out)\n", - "\n", - " # Sigmoid function\n", - " sig_out = 
self.sig(out)\n", - "\n", - " # return last sigmoid output and hidden state\n", - " return sig_out, hidden\n", - "\n", - " def init_hidden(self, batch_size):\n", - " ''' Initializes hidden state '''\n", - " # Create two new tensors with sizes n_layers x batch_size x hidden_dim,\n", - " # initialized to zero, for hidden state and cell state of LSTM\n", - " h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)\n", - " c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)\n", - " hidden = (h0,c0)\n", - " return hidden" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "We choose the parameters of the model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# Parameters of our network\n", - "\n", - "# Size of our vocabulary\n", - "vocab_size = num_words_dict\n", - "\n", - "# Embedding dimension\n", - "embedding_dim = 32\n", - "\n", - "# Number of stacked LSTM layers\n", - "no_layers = 2\n", - "\n", - "# Dimension of the hidden layer in LSTMs\n", - "hidden_dim = 64\n", - "\n", - "# Dropout parameter for regularization\n", - "output_dim = 1\n", - "\n", - "# Dropout parameter for regularization\n", - "drop_prob = 0.25" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "SentimentRNN(\n", - " (embedding): Embedding(30000, 32)\n", - " (lstm): LSTM(32, 64, num_layers=2, batch_first=True, dropout=0.25)\n", - " (dropout): Dropout(p=0.25, inplace=False)\n", - " (fc): Linear(in_features=64, out_features=1, bias=True)\n", - " (sig): Sigmoid()\n", - ")\n" - ] - } - ], - "source": [ - "# Let's define our model\n", - "model = SentimentRNN(no_layers, vocab_size, hidden_dim,\n", - " embedding_dim, drop_prob=drop_prob)\n", - "# Moving to gpu\n", - "model.to(device)\n", - "print(model)" - ] 
- }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total Number of parameters: 1018433\n" - ] - } - ], - "source": [ - "# How many trainable parameters does our model have?\n", - "model_parameters = filter(lambda p: p.requires_grad, model.parameters())\n", - "params = sum([np.prod(p.size()) for p in model_parameters])\n", - "print('Total Number of parameters: ',params)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "We choose the losses and the optimizer for the training procces." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [], - "source": [ - "# loss and optimization functions\n", - "lr = 0.001\n", - "\n", - "# Binary crossentropy is a good loss function for a binary classification problem\n", - "criterion = nn.BCELoss()\n", - "\n", - "# We choose an Adam optimizer\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", - "\n", - "# function to predict accuracy\n", - "def acc(pred,label):\n", - " pred = torch.round(pred.squeeze())\n", - " return torch.sum(pred == label.squeeze()).item()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "We are ready to train our model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1\n", - "train_loss : 0.4367361353733577 val_loss : 0.39174133955966683\n", - "train_accuracy : 79.530625 val_accuracy : 82.3628125\n", - "Validation loss decreased (inf --> 0.391741). 
Saving model ...\n", - "==================================================\n", - "Epoch 2\n", - "train_loss : 0.3765802335098851 val_loss : 0.3724124691961333\n", - "train_accuracy : 83.19140625 val_accuracy : 83.42031250000001\n", - "Validation loss decreased (0.391741 --> 0.372412). Saving model ...\n", - "==================================================\n", - "Epoch 3\n", - "train_loss : 0.35746844720793886 val_loss : 0.365050206175074\n", - "train_accuracy : 84.16882812499999 val_accuracy : 83.7440625\n", - "Validation loss decreased (0.372412 --> 0.365050). Saving model ...\n", - "==================================================\n", - "Epoch 4\n", - "train_loss : 0.34491546426317654 val_loss : 0.36467386982403693\n", - "train_accuracy : 84.879140625 val_accuracy : 83.77\n", - "Validation loss decreased (0.365050 --> 0.364674). Saving model ...\n", - "==================================================\n", - "Epoch 5\n", - "train_loss : 0.33429012800217606 val_loss : 0.36189084346871825\n", - "train_accuracy : 85.44296875 val_accuracy : 84.0221875\n", - "Validation loss decreased (0.364674 --> 0.361891). 
Saving model ...\n", - "==================================================\n" - ] - } - ], - "source": [ - "# Number of training Epochs\n", - "epochs = 5\n", - "\n", - "# Maximum absolute value accepted for the gradeint\n", - "clip = 5\n", - "\n", - "# Initial Loss value (assumed big)\n", - "valid_loss_min = np.Inf\n", - "\n", - "# Lists to follow the evolution of the loss and accuracy\n", - "epoch_tr_loss,epoch_vl_loss = [],[]\n", - "epoch_tr_acc,epoch_vl_acc = [],[]\n", - "\n", - "# Train for a number of Epochs\n", - "for epoch in range(epochs):\n", - " train_losses = []\n", - " train_acc = 0.0\n", - " model.train()\n", - "\n", - " for inputs, labels in train_loader:\n", - "\n", - " # Initialize hidden state\n", - " h = model.init_hidden(batch_size)\n", - " # Creating new variables for the hidden state\n", - " h = tuple([each.data.to(device) for each in h])\n", - "\n", - " # Move batch inputs and labels to gpu\n", - " inputs, labels = inputs.to(device), labels.to(device)\n", - "\n", - " # Set gradient to zero\n", - " model.zero_grad()\n", - "\n", - " # Compute model output\n", - " output,h = model(inputs,h)\n", - "\n", - " # Calculate the loss and perform backprop\n", - " loss = criterion(output.squeeze(), labels.float())\n", - " loss.backward()\n", - " train_losses.append(loss.item())\n", - "\n", - " # calculating accuracy\n", - " accuracy = acc(output,labels)\n", - " train_acc += accuracy\n", - "\n", - " #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.\n", - " nn.utils.clip_grad_norm_(model.parameters(), clip)\n", - " optimizer.step()\n", - "\n", - "\n", - " # Evaluate on the validation set for this epoch\n", - " val_losses = []\n", - " val_acc = 0.0\n", - " model.eval()\n", - " for inputs, labels in valid_loader:\n", - "\n", - " # Initialize hidden state\n", - " val_h = model.init_hidden(batch_size)\n", - " val_h = tuple([each.data.to(device) for each in val_h])\n", - "\n", - " # Move batch inputs and labels to gpu\n", - " 
inputs, labels = inputs.to(device), labels.to(device)\n", - "\n", - " # Compute model output\n", - " output, val_h = model(inputs, val_h)\n", - "\n", - " # Compute Loss\n", - " val_loss = criterion(output.squeeze(), labels.float())\n", - "\n", - " val_losses.append(val_loss.item())\n", - "\n", - " accuracy = acc(output,labels)\n", - " val_acc += accuracy\n", - "\n", - " epoch_train_loss = np.mean(train_losses)\n", - " epoch_val_loss = np.mean(val_losses)\n", - " epoch_train_acc = train_acc/len(train_loader.dataset)\n", - " epoch_val_acc = val_acc/len(valid_loader.dataset)\n", - " epoch_tr_loss.append(epoch_train_loss)\n", - " epoch_vl_loss.append(epoch_val_loss)\n", - " epoch_tr_acc.append(epoch_train_acc)\n", - " epoch_vl_acc.append(epoch_val_acc)\n", - " print(f'Epoch {epoch+1}')\n", - " print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')\n", - " print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')\n", - " if epoch_val_loss <= valid_loss_min:\n", - " print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min,epoch_val_loss))\n", - " # torch.save(model.state_dict(), '../working/state_dict.pt')\n", - " valid_loss_min = epoch_val_loss\n", - " print(25*'==')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {} - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light", - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "fig = plt.figure(figsize = (20, 6))\n", - "plt.subplot(1, 2, 1)\n", - "plt.plot(epoch_tr_acc, label='Train Acc')\n", - "plt.plot(epoch_vl_acc, label='Validation Acc')\n", - "plt.title(\"Accuracy\")\n", - "plt.legend()\n", - "plt.grid()\n", - "\n", - "plt.subplot(1, 2, 2)\n", - "plt.plot(epoch_tr_loss, label='Train loss')\n", - "plt.plot(epoch_vl_loss, label='Validation loss')\n", - "plt.title(\"Loss\")\n", - "plt.legend()\n", - "plt.grid()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {} - }, - "source": [ - "---\n", - "# What's Next?\n", - "\n", - "You can use this project template as a starting point to think about your own project. There are a lot of ways to continue, here we share with you some ideas you migth find useful:\n", - "\n", - "* **Work on the Preproccesing.** We used a very rudimentary way to tokenize tweets. But there are better ways to preprocess the data. Can you think of a suitable way to preprocess the data for this particular task? How does the performance of the model change when the data is processed correctly?\n", - "* **Work on the Model.** The RNN model proposed in this notebook is not optimized at all. You can work on finding a better architecture or better hyperparamenters. May be using bidirectonal LSTMs or increasing the number of stacked layers can improve the performance, feel free to try different approaches.\n", - "* **Work on the Embedding.** Our model learnt an embedding during the training on this Twitter corpus for a particular task. You can explore the representation of different words in this learned embedding. Also, you can try using different word embeddings. You can train them on this corpus or you can use an embedding trained on another corpus of data. 
How does the change of the embedding affect the model performance?\n", - "* **Try sentiment analysis on another dataset.** There are lots of available dataset to work with, we can help you find one that is interesting to you. Do you belive that a sentiment analysis model trained on some corpus (Twitter dataset) will perform well on another type of data (for example, youtube comments)?\n", - "\n" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "include_colab_link": true, - "name": "sentiment_analysis", - "provenance": [], - "toc_visible": true - }, - "kernel": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} +{"cells":[{"cell_type":"markdown","metadata":{"execution":{},"id":"view-in-github"},"source":["\"Open   \"Open"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"D_fgc45VfjDz"},"source":["# Twitter Sentiment Analysis\n","\n","**By Neuromatch Academy**\n","\n","__Content creators:__ Juan Manuel Rodriguez, Salomey Osei, Gonzalo Uribarri\n","\n","__Production editors:__ Amita Kapoor, Spiros Chavlis"]},{"cell_type":"markdown","metadata":{"execution":{}},"source":["---\n","# Welcome to the NLP project template\n","\n",""]},{"cell_type":"markdown","metadata":{"execution":{}},"source":["---\n","# Step 1: Questions and goals\n","\n","* Can we infer emotion from a tweet text?\n","* How words are distributed accross the dataset?\n","* Are words related to one kind of emotion?"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"Vd1qdNW9fjD1"},"source":["---\n","# Step 2: Literature review\n","\n","[Original Dataset Paper](https://cs.stanford.edu/people/alecmgo/papers/TwitterDistantSupervision09.pdf)\n","\n","[Papers with 
code](https://paperswithcode.com/dataset/imdb-movie-reviews)"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"oOYDQElpfjD2"},"source":["---\n","# Step 3: Load and explore the dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":103706,"status":"ok","timestamp":1720042135196,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"EZpxSExUfjD2","outputId":"19b01445-9b83-4a93-9cc2-7830ab0dcf5b"},"outputs":[],"source":["# @title Install dependencies\n","!pip install pandas --quiet\n","!pip install torchtext --quiet\n","!pip install datasets --quiet"]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":9008,"status":"ok","timestamp":1720042144200,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"DxqD3Tk5fjD3","outputId":"451d68c5-7894-4f93-9f54-bf0b7f482e20"},"outputs":[{"name":"stderr","output_type":"stream","text":["/usr/local/lib/python3.10/dist-packages/torchtext/data/__init__.py:4: UserWarning: \n","/!\\ IMPORTANT WARNING ABOUT TORCHTEXT STATUS /!\\ \n","Torchtext is deprecated and the last released version will be 0.18 (this one). 
You can silence this warning by calling the following at the beginnign of your scripts: `import torchtext; torchtext.disable_torchtext_deprecation_warning()`\n"," warnings.warn(torchtext._TORCHTEXT_DEPRECATION_MSG)\n"]}],"source":["# We import some libraries to load the dataset\n","import os\n","import numpy as np\n","import pandas as pd\n","import matplotlib.pyplot as plt\n","\n","from datasets import load_dataset\n","\n","from collections import Counter\n","from tqdm.notebook import tqdm\n","\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","import torch.nn.functional as F\n","from torch.utils.data import TensorDataset, DataLoader\n","\n","import torchtext\n","from torchtext.data import get_tokenizer\n","\n","from sklearn.utils import shuffle\n","from sklearn.metrics import classification_report\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.model_selection import train_test_split\n","from sklearn.feature_extraction.text import CountVectorizer"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"63Eg1SLbfjD4"},"source":["You can find the dataset we are going to use in [this 
website](http://help.sentiment140.com/for-students/)."]},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":567,"referenced_widgets":["fbb4191426bd485e8e965b6d432eecae","df7eba182d1b4c21bc21d157eac6b996","6d64402d9da74516ab4e1d46ae9f1ee3","d9ca809f7b1c49e595a05458251f3ab2","90908b6f69524a72860214ef8bd2d946","db432a2cd6244a7592fc9732f0ca4738","84485541f3a14c65a67d10a97b72bbad","5fa7ab2ab2004e5cb692199e2bd27d6b","ab71bd2b452146829e973d6cf99f31ed","55ba92cfe0724286ac1c2bbe6577e5c8","67a4fa49ca5349d58512a16a3742d401","afd671543846468abfe37669a72845c3","057e918ace004506aedc4e4b9942c3a8","325387f6b62d47b0b21bea61676cea72","ea1e3eb0e6ec4f8d82cf9b12cfe6e700","96c2d7ee644a438982e1792b7ec0453c","9baa1a735c0646b89953bf4a7c7fc92c","0ac9711c8ece4c5397a8cd810713adfb","a8d69769921241b8b1081e84f7770858","d189f24b0e964d1a9fc86379bad38cca","db9bf44dec914db793cc4f73751c272c","1cf3ba0f756f4aa5ad1dcb675a791cfa","c432c4efcb794ce781fcb6f176f1b60d","510eeffb32694e7798f23e3931d7a943","a8b3dfaa2831416582d8eeef01451386","db1cdafaf36f4c339476f3221abc17b3","ffd3778a96e046718828bbc5aa73f173","49c5a3fbe87b491cb3f0f450a0af0659","252949e8784c4878a62eb2e30b1e3466","7bcef602e7f441308472bc145b12dcd3","97fb30a5a31742efa1d188b9361f9938","9b34daddb9cc48bba109e547177ec654","fd2b5a6533794a2794579956c25247fb","f3a9667c8c994324a2409f227bd0a1e9","6e6c5372ffe045c0b72587989567429e","2ead0216695e4227aef44552f4ec3cc9","53843f49adda4bce8450fd91fa9fd587","40262cb3eefa45fcbe37aaafccb69f5f","b54b826314ea4b3a92eebd218c093fc1","8cd7be688b8c4818be48915db14a0792","a9a0f6ce71ed415c8c8513f68e34e162","7f638a6deacd42e88c031fa47797516b","849e39cc86f64e558ff94bf542a5121a","67b0b03c391c414bad5ea9fb3c947a2f","1cef38981af6457dbaeb393f9936a389","b0b5cfae51214c60bbca9a09b196c217","5ee2a4b33be04c6db8ee4d7995c2376d","403fffb635c2409ebeabc90063750ed3","6279343019064572adedf34cfbd437fa","2715d00db77545f9aa5eace8a0eb2839","942ce490d87347c789e229589b1b9c9f","f04df4daeb6049ab85d3d75b
472ccf6e","fd0b3c53b66543cea0c396d8047445a8","2c42e2fef6314c9e842a7e9641af3cab","913d95e58aa94e4a8009768a23fbf304"]},"execution":{},"executionInfo":{"elapsed":189390,"status":"ok","timestamp":1720042333586,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"3HLOsd3rfjD4","outputId":"7653fee1-a871-472b-a978-d8ec0250dc84"},"outputs":[{"name":"stderr","output_type":"stream","text":["/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"fbb4191426bd485e8e965b6d432eecae","version_major":2,"version_minor":0},"text/plain":["Downloading builder script: 0%| | 0.00/4.03k [00:00\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
polarityuserdatequeryusertext
00_TheSpecialOne_Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, t...
10scotthamiltonMon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
20mattycusMon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
30ElleCTFMon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
40KaroliMon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n"," \n"],"text/plain":[" polarity user date query \\\n","0 0 _TheSpecialOne_ Mon Apr 06 22:19:45 PDT 2009 NO_QUERY \n","1 0 scotthamilton Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n","2 0 mattycus Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n","3 0 ElleCTF Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n","4 0 Karoli Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n","\n"," user text \n","0 _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, t... \n","1 scotthamilton is upset that he can't update his Facebook by ... \n","2 mattycus @Kenichan I dived many times for the ball. Man... \n","3 ElleCTF my whole body feels itchy and like its on fire \n","4 Karoli @nationwideclass no, it's not behaving at all.... "]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["# We load the dataset\n","\n","dataset = load_dataset(\"stanfordnlp/sentiment140\", trust_remote_code= True)\n","\n","train_data = dataset[\"train\"]\n","df = pd.DataFrame(train_data)\n","df = df.rename(columns={'sentiment': 'polarity'})\n","df = df[['polarity', 'user', 'date', 'query', 'user', 'text']]\n","df.head()"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"fuKShcfjfjD4"},"source":["For this project we will use only the text and the polarity of the tweet. 
Notice that polarity is 0 for negative tweets and 4 for positive tweets."]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{},"executionInfo":{"elapsed":1059,"status":"ok","timestamp":1720042334642,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"GXHQOn6gfjD5"},"outputs":[],"source":["X = df.text.values\n","\n","# Changes values from [0,4] to [0,1]\n","y = (df.polarity.values > 1).astype(int)\n","\n","\n","# Split the data into train and test\n","x_train_text, x_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"7kr3TO_LfjD5"},"source":["The first thing we have to do before working on the models is to familiarize ourselves with the dataset. This is called Exploratory Data Analysis (EDA)."]},{"cell_type":"code","execution_count":5,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":12,"status":"ok","timestamp":1720042334642,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"FsL-xY03fjD5","outputId":"655f0ef8-c177-4f42-c024-1d628241401a"},"outputs":[{"name":"stdout","output_type":"stream","text":["1: @paisleypaisley LOL why do i get ideas so far in advance? it's not even june yet! we need a third knitter to have our own summer group \n","0: worst headache ever \n","0: @ewaniesciuszko i am so sad i wont see you! I miss you already. and yeah! that's perfect; i come back the 18th!\n","1: doesn't know how to spell conked \n","0: "So we stand here now and no one knows us at all I won't get used to this I won't get used to being gone"...I miss home and everyone -a\n"]}],"source":["for s, l in zip(x_train_text[:5], y_train[:5]):\n"," print('{}: {}'.format(l, s))"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"4cPGXSc-fjD5"},"source":["An interesting thing to analyze is the Word Distribution. 
In order to count the occurrences of each word, we should tokenize the sentences first."]},{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":9,"status":"ok","timestamp":1720042334642,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"U1OugpZ0fjD5","outputId":"9e6cb4e3-8d8c-4db0-c113-bdd4fe87db5f"},"outputs":[{"name":"stdout","output_type":"stream","text":["Before Tokenize: worst headache ever \n","After Tokenize: ['worst', 'headache', 'ever']\n"]}],"source":["tokenizer = get_tokenizer(\"basic_english\")\n","\n","print('Before Tokenize: ', x_train_text[1])\n","print('After Tokenize: ', tokenizer(x_train_text[1]))"]},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":81,"referenced_widgets":["e1348a02ceeb4af19fbd63d52b7d843b","fbf51b14e6b34d0485ddf59c43d22c49","c29e06a72ac9401b8c41f4195021071e","48b812211db04284bfbbf02823fb879a","5455119809c74916acc50e1905903ded","2475bd62a3224bacb38a6334d07d6a8c","3d29947b5d2d4e2abc1355d900096642","3f7a8f56f15c434da70029366a37167a","3610a2db297f4686bf9043f2b7ee55b5","a1bd0616199e44538977ee2ea6049690","835fb9a91b34471fa6d61adf37616f52","d8de1a85076b453a92295e79110ba8fd","78d48ee2fb9f42089f475fcf5fc368c8","b0ca3012d0b84c5a9d7c1fc176251af7","39fa73efcbf54d8dad225d8380061dbf","6b6cc35257fe433e93736d02e898b6b8","e0fc900d8b5940a6bd6a97e58adb4651","6b7286d74e0f4a0199dbfcaf3dd0d622","a4bbd3df99cd4acab5e1b3ba5cd7c114","9a7140a6197945d5bac5c48b820dfb04","0bdc146792a64853ae06a9d185aa2b15","768da964ffcd44fea1af09e81f5621f3"]},"execution":{},"executionInfo":{"elapsed":29122,"status":"ok","timestamp":1720042363757,"user":{"displayName":"Dalia 
Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"7ZggzGCXfjD6","outputId":"ae19f8d6-224d-4224-d3a0-d00c659ec9b2"},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"e1348a02ceeb4af19fbd63d52b7d843b","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/1280000 [00:00"]},"metadata":{},"output_type":"display_data"}],"source":["plt.bar(range(100), [words[w] for w in sorted_words[:100]])\n","plt.show()"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"o9IYA0cZfjD7"},"source":["It is very common to find this kind of distribution when analyzing corpus of text. This is referred to as the [zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law)."]},{"cell_type":"markdown","metadata":{"execution":{},"id":"5FQIOqoRfjD7"},"source":["Usually the number of words in the dictionary will be very large.\n","\n","Here are some thing we can do to reduce that number:\n","\n","* Remove puntuation.\n","* Remove stop-words.\n","* Steaming.\n","* Remove very uncommon words (the words that appears in fewer than N occations).\n","* Nothing: we can use a pretrain model that handles this kind of situations.\n","\n","\n","We used one of the simplest tokenizers availables. This tokenizer does not take into account many quirks of the language. Moreover, diferent languages have different quirks, so there is no \"universal\" tokenizers. There are many libraries that have \"better\" tokenizers:\n","\n","* [Spacy](https://spacy.io/): it can be accessed using: `get_tokenizer(\"spacy\")`. Spacy supports a wide range of languages.\n","* [Huggingface](https://huggingface.co/): it has many tokenizers for different laguages. [Doc](https://huggingface.co/transformers/main_classes/tokenizer.html)\n","* [NLTK](https://www.nltk.org/): it provides several tokenizers. 
One of them can be accessed using: `get_tokenizer(\"toktok\")`\n"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"_ul5MgYcfjD7"},"source":["---\n","# Step 4: choose toolkit\n","\n","Our goal is to train a model capable of estimating the sentiment of a tweet (positive or negative) by reading its content. To that end we will try 2 different approaches:\n","\n","* A logistic regression using sklearn. **NOTE**: it can probably work better than an SVM model.\n","* A simple Embedding + RNN."]},{"cell_type":"markdown","metadata":{"execution":{},"id":"GteI1PxTfjD7"},"source":["## Logistic regression\n","\n","We will represent our sentences using binary vectorization. This means that our data would be represented as a matrix of instances by word with a one if the word is in the instance, and zero otherwise. Sklearn vectorizers can also do things such as stop-word removal and punctuation removal, you can read more about it in [the documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)."]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{},"executionInfo":{"elapsed":22699,"status":"ok","timestamp":1720042396408,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"S_ei2qu8fjD7"},"outputs":[],"source":["vectorizer = CountVectorizer(binary=True)\n","x_train_cv = vectorizer.fit_transform(x_train_text)\n","x_test_cv = vectorizer.transform(x_test_text)"]},{"cell_type":"code","execution_count":12,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":17,"status":"ok","timestamp":1720042396409,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"iK_zfqnLfjD7","outputId":"9b3f6db3-01bf-4246-b943-359620c717a2"},"outputs":[{"name":"stdout","output_type":"stream","text":["Before Vectorize: doesn't know how to spell conked \n"]}],"source":["print('Before Vectorize: ', 
x_train_text[3])"]},{"cell_type":"code","execution_count":13,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1720042396409,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"wKxY8e38fjD8","outputId":"19530135-070d-4259-d6a9-7ba06b519763"},"outputs":[{"name":"stdout","output_type":"stream","text":["After Vectorize: \n"," (0, 528584)\t1\n"," (0, 165468)\t1\n"," (0, 300381)\t1\n"," (0, 242211)\t1\n"," (0, 489893)\t1\n"," (0, 134160)\t1\n"]}],"source":["# Notice that the matriz is sparse\n","print('After Vectorize: ')\n","print(x_train_cv[3])"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"QTPPEMd9fjD8"},"source":["Now we can train our model. You can check the documentation of this logistic regressor [here](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic#sklearn.linear_model.LogisticRegression)."]},{"cell_type":"code","execution_count":14,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":74},"execution":{},"executionInfo":{"elapsed":127277,"status":"ok","timestamp":1720042523682,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"2vEPOQS6fjD8","outputId":"3be77fc0-76e6-40b8-8847-5f6e7c6c0ce0"},"outputs":[{"data":{"text/html":["
LogisticRegression(solver='saga')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"],"text/plain":["LogisticRegression(solver='saga')"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["model = LogisticRegression(solver='saga')\n","model.fit(x_train_cv, y_train)"]},{"cell_type":"code","execution_count":15,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":7,"status":"ok","timestamp":1720042523683,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"37bUbqB6fjD8","outputId":"7eb9178d-6130-47d0-bdf4-ce4be164bc97"},"outputs":[{"name":"stdout","output_type":"stream","text":[" precision recall f1-score support\n","\n"," 0 0.81 0.79 0.80 160000\n"," 1 0.79 0.81 0.80 160000\n","\n"," accuracy 0.80 320000\n"," macro avg 0.80 0.80 0.80 320000\n","weighted avg 0.80 0.80 0.80 320000\n","\n"]}],"source":["y_pred = model.predict(x_test_cv)\n","\n","print(classification_report(y_test, y_pred))"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"161kDLhofjD8"},"source":["## Explainable AI\n","The best thing about logistic regresion is that it is simple, and we can get some explanations."]},{"cell_type":"code","execution_count":16,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":1105,"status":"ok","timestamp":1720042524784,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"EILTmxzifjD9","outputId":"b7ce6853-7385-4a24-d4eb-e6d0843ca5d5"},"outputs":[{"name":"stdout","output_type":"stream","text":["(1, 589260)\n","589260\n"]}],"source":["print(model.coef_.shape)\n","print(len(vectorizer.vocabulary_))\n","\n","words_sk = list(vectorizer.vocabulary_.keys())\n","words_sk.sort(key=lambda w: model.coef_[0, vectorizer.vocabulary_[w]])"]},{"cell_type":"code","execution_count":17,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":12,"status":"ok","timestamp":1720042524784,"user":{"displayName":"Dalia 
Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"NGjVPON6fjD9","outputId":"d40443bc-476d-4f5a-ce90-4b5b17e47933"},"outputs":[{"name":"stdout","output_type":"stream","text":["roni: -3.8625952420933984\n","inaperfectworld: -3.5734321547933936\n","dontyouhate: -3.5002133484207576\n","xbllygbsn: -3.4126303898325787\n","anqju: -3.3363997631497493\n","sad: -3.200516823534637\n","pakcricket: -3.1949062976331675\n","condolences: -3.132503698316079\n","heartbreaking: -3.0665219866881297\n","saddest: -3.042020604188048\n","sadd: -3.029036146667248\n","heartbroken: -3.0287524416643463\n","boohoo: -3.0226033087262802\n","sadface: -2.991829110065316\n","rachelle_lefevr: -2.925076661509848\n","disappointing: -2.902522686643491\n","lvbu: -2.8947109582208865\n","saddens: -2.8855187276040715\n","bummed: -2.836500453805889\n","neda: -2.792917726280752\n"]}],"source":["for w in words_sk[:20]:\n"," print('{}: {}'.format(w, model.coef_[0, vectorizer.vocabulary_[w]]))"]},{"cell_type":"code","execution_count":18,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":10,"status":"ok","timestamp":1720042524784,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"oxQ_jlNRfjD9","outputId":"363de58b-817a-4205-f019-2379d0d64e0d"},"outputs":[{"name":"stdout","output_type":"stream","text":["iamsoannoyed: 2.8493838469077013\n","myfax: 2.7974330510971424\n","jennamadison: 2.5667217237933104\n","yeyy: 2.4780234846131646\n","tryout: 2.438315611477797\n","goldymom: 2.4374072779309204\n","wooohooo: 2.402957513257194\n","thesupergirl: 2.356525094856456\n","iammaxathotspot: 2.3116551216589682\n","londicreations: 2.3074264075299316\n","smilin: 2.2991796213822497\n","worries: 2.2899555142510084\n","sinfulsignorita: 2.27989578448778\n","finchensnail: 2.2642827277181063\n","smackthis: 2.237672991997692\n","kv: 2.2157591386122775\n","tojosan: 2.2117938132889696\n","russmarshalek: 2.20953890861265\n","traciknoppe: 
2.1768232307222153\n","congratulations: 2.1715901103136876\n"]}],"source":["for w in reversed(words_sk[-20:]):\n"," print('{}: {}'.format(w, model.coef_[0, vectorizer.vocabulary_[w]]))"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"9KSSAC3qfjD9"},"source":["What does this mean?\n","\n","Remember the `model.coef_` is the $W$ in:\n","\n","$$h(x)=\\sigma(WX + b)$$\n","\n","where the label 1 is a positive tweet and the label 0 is a negative tweet."]},{"cell_type":"markdown","metadata":{"execution":{},"id":"oDHjTP2_fjD9"},"source":["## Recurrent Neural Network with Pytorch"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"TbgpKy95fjD9"},"source":["In the previous section we use a Bag-Of-Words approach to represent each of the tweets. That meas that we only consider how many times each of the words appear in each of the tweets, we didnt take into account the order of the words. But we know that the word order is very important and carries relevant information.\n","\n","In this section we will solve the same task, but this time we will implement a Recurrent Neural Network (RNN) instead of using a simple Logistic Regression.Unlike feedforward neural networks, RNNs have cyclic connections making them powerful for modeling sequences.\n","\n","Let's start by importing the relevant libraries.\n"]},{"cell_type":"code","execution_count":19,"metadata":{"execution":{},"executionInfo":{"elapsed":8,"status":"ok","timestamp":1720042524784,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"7nmUJV99fjEB"},"outputs":[],"source":["def set_device():\n"," device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n"," if device != \"cuda\":\n"," print(\"WARNING: For this notebook to perform best, \"\n"," \"if possible, in the menu under `Runtime` -> \"\n"," \"`Change runtime type.` select `GPU` \")\n"," else:\n"," print(\"GPU is enabled in this notebook.\")\n","\n"," return 
device"]},{"cell_type":"code","execution_count":20,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":7,"status":"ok","timestamp":1720042524784,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"chI-18LcfjEB","outputId":"7f633079-6548-48f3-802e-94bc9cfada93"},"outputs":[{"name":"stdout","output_type":"stream","text":["GPU is enabled in this notebook.\n"]}],"source":["# Set the device (check if gpu is available)\n","device = set_device()"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"01UtIN7ofjEC"},"source":["First we will create a Dictionary (`word_to_idx`). This dictionary will map each Token (usually words) to an index (an integer number). We want to limit our dictionary to a certain number of tokens (`num_words_dict`), so we will include in our dictionary only the tokens with the most occurrences."]},{"cell_type":"code","execution_count":21,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1720042524784,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"afus9SyUfjEC","outputId":"bb4eb869-e2f0-4ccd-f64c-e55908272345"},"outputs":[{"data":{"text/plain":["['.', 'i', '!', \"'\", 'to', 'the', ',', 'a', 'my', 'it']"]},"execution_count":21,"metadata":{},"output_type":"execute_result"}],"source":["# From previous section, we have a list with the most used tokens\n","sorted_words[:10]"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"6vfQFjaufjEC"},"source":["Let's select only the most used."]},{"cell_type":"code","execution_count":22,"metadata":{"execution":{},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1720042524785,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"tGLkxaGcfjEC"},"outputs":[],"source":["num_words_dict = 30000\n","# We reserve two numbers for special tokens.\n","most_used_words = 
sorted_words[:num_words_dict-2]"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"AzhQvekCfjEC"},"source":["We will add two extra Tokens to the dictionary, one for words outside the dictionary (`'UNK'`) and one for padding the sequences (`'PAD'`)."]},{"cell_type":"code","execution_count":23,"metadata":{"execution":{},"executionInfo":{"elapsed":4,"status":"ok","timestamp":1720042524785,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"73Wrb-lEfjEC"},"outputs":[],"source":["# dictionary to go from words to idx\n","word_to_idx = {}\n","# dictionary to go from idx to words (just in case)\n","idx_to_word = {}\n","\n","\n","# We include the special tokens first\n","PAD_token = 0\n","UNK_token = 1\n","\n","word_to_idx['PAD'] = PAD_token\n","word_to_idx['UNK'] = UNK_token\n","\n","idx_to_word[PAD_token] = 'PAD'\n","idx_to_word[UNK_token] = 'UNK'\n","\n","# We popullate our dictionaries with the most used words\n","for num,word in enumerate(most_used_words):\n"," word_to_idx[word] = num + 2\n"," idx_to_word[num+2] = word"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"kMHVkEisfjEC"},"source":["Our goal now is to transform each tweet from a sequence of tokens to a sequence of indexes. 
These sequences of indexes will be the input to our pytorch model."]},{"cell_type":"code","execution_count":24,"metadata":{"execution":{},"executionInfo":{"elapsed":4,"status":"ok","timestamp":1720042524785,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"tkCIu3PKfjED"},"outputs":[],"source":["# A function to convert list of tokens to list of indexes\n","def tokens_to_idx(sentences_tokens,word_to_idx):\n"," sentences_idx = []\n"," for sent in sentences_tokens:\n"," sent_idx = []\n"," for word in sent:\n"," if word in word_to_idx:\n"," sent_idx.append(word_to_idx[word])\n"," else:\n"," sent_idx.append(word_to_idx['UNK'])\n"," sentences_idx.append(sent_idx)\n"," return sentences_idx"]},{"cell_type":"code","execution_count":25,"metadata":{"execution":{},"executionInfo":{"elapsed":9346,"status":"ok","timestamp":1720042534127,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"aHru4vpzfjED"},"outputs":[],"source":["x_train_idx = tokens_to_idx(x_train_token,word_to_idx)\n","x_test_idx = tokens_to_idx(x_test_token,word_to_idx)"]},{"cell_type":"code","execution_count":26,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":8,"status":"ok","timestamp":1720042534127,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"Ofj3OD7zfjED","outputId":"b2788d03-dbfa-41d7-8231-5011206baa59"},"outputs":[{"name":"stdout","output_type":"stream","text":["Before converting: ['worst', 'headache', 'ever']\n","After converting: [721, 458, 237]\n"]}],"source":["some_number = 1\n","print('Before converting: ', x_train_token[some_number])\n","print('After converting: ', x_train_idx[some_number])"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"NcCicvb-fjED"},"source":["We need all the sequences to have the same length. 
To select an adequate sequence length, let's explore some statistics about the length of the tweets:"]},{"cell_type":"code","execution_count":27,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":6,"status":"ok","timestamp":1720042534128,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"BSjhdyYUfjED","outputId":"82e49be9-7868-44ac-b496-c7a48da1efee"},"outputs":[{"name":"stdout","output_type":"stream","text":["Max tweet word length: 229\n","Mean tweet word length: 15.0\n","99% percent under: 37.0\n"]}],"source":["tweet_lens = np.asarray([len(sentence) for sentence in x_train_idx])\n","print('Max tweet word length: ',tweet_lens.max())\n","print('Mean tweet word length: ',np.median(tweet_lens))\n","print('99% percent under: ',np.quantile(tweet_lens,0.99))"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"t311WY6ZfjEE"},"source":["We cut the sequences which are larger than our chosen maximum length (`max_lenght`) and fill with zeros the ones that are shorter."]},{"cell_type":"code","execution_count":28,"metadata":{"execution":{},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1720042534128,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"r4S8KTWLfjEE"},"outputs":[],"source":[" # We choose the max length\n"," max_length = 40\n","\n","# A function to make all the sequence have the same lenght\n","# Note that the output is a Numpy matrix\n"," def padding(sentences, seq_len):\n"," features = np.zeros((len(sentences), seq_len),dtype=int)\n"," for ii, tweet in enumerate(sentences):\n"," len_tweet = len(tweet)\n"," if len_tweet != 0:\n"," if len_tweet <= seq_len:\n"," # If its shorter, we fill with zeros (the padding Token index)\n"," features[ii, -len(tweet):] = np.array(tweet)[:seq_len]\n"," if len_tweet > seq_len:\n"," # If its larger, we take the last 'seq_len' indexes\n"," features[ii, :] = np.array(tweet)[-seq_len:]\n"," 
return features"]},{"cell_type":"code","execution_count":29,"metadata":{"execution":{},"executionInfo":{"elapsed":4762,"status":"ok","timestamp":1720042538886,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"Z-Cw-bBxfjEE"},"outputs":[],"source":["# We convert our list of tokens into a numpy matrix\n","# where all instances have the same lenght\n","x_train_pad = padding(x_train_idx,max_length)\n","x_test_pad = padding(x_test_idx,max_length)\n","\n","# We convert our target list a numpy matrix\n","y_train_np = np.asarray(y_train)\n","y_test_np = np.asarray(y_test)"]},{"cell_type":"code","execution_count":30,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":12,"status":"ok","timestamp":1720042538886,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"8eC3YswlfjEE","outputId":"3bb0ea7f-518f-4545-9241-feb783f48122"},"outputs":[{"name":"stdout","output_type":"stream","text":["Before padding: [1, 3, 71, 24, 122, 3, 533, 74, 13, 4, 3, 102, 13, 209, 2, 12, 150, 4, 22, 5, 18, 667, 3, 138, 61, 7, 3296, 4]\n","After padding: [ 0 0 0 0 0 0 0 0 0 0 0 0 1 3\n"," 71 24 122 3 533 74 13 4 3 102 13 209 2 12\n"," 150 4 22 5 18 667 3 138 61 7 3296 4]\n"]}],"source":["some_number = 2\n","print('Before padding: ', x_train_idx[some_number])\n","print('After padding: ', x_train_pad[some_number])"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"SzDhnauUfjEE"},"source":["Now, let's convert the data to pytorch format.\n"]},{"cell_type":"code","execution_count":31,"metadata":{"execution":{},"executionInfo":{"elapsed":10,"status":"ok","timestamp":1720042538886,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"--Yd22YWfjEF"},"outputs":[],"source":["# create Tensor datasets\n","train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train_np))\n","valid_data = 
TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test_np))\n","\n","# Batch size (this is an important hyperparameter)\n","batch_size = 100\n","\n","# dataloaders\n","# make sure to SHUFFLE your data\n","train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,drop_last = True)\n","valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size,drop_last = True)"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"jQ5qPOWTfjEF"},"source":["Each batch of data in our traning proccess will have the folllowing format:"]},{"cell_type":"code","execution_count":33,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":598,"status":"ok","timestamp":1720042563992,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"S1mqhk1hfjEF","outputId":"c97e7edd-695f-4336-a2e6-f6bed4852a63"},"outputs":[{"name":"stdout","output_type":"stream","text":["Sample input size: torch.Size([100, 40])\n","Sample input: \n"," tensor([[ 0, 0, 0, ..., 32, 203, 86],\n"," [ 0, 0, 0, ..., 1, 1, 4661],\n"," [ 0, 0, 0, ..., 169, 43, 34],\n"," ...,\n"," [ 0, 0, 0, ..., 2, 2961, 4076],\n"," [ 0, 0, 0, ..., 2319, 1325, 2],\n"," [ 0, 0, 0, ..., 7, 253, 1]])\n","Sample input: \n"," tensor([0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,\n"," 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0,\n"," 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,\n"," 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,\n"," 0, 1, 0, 1])\n"]}],"source":["# Obtain one batch of training data\n","dataiter = iter(train_loader)\n","sample_x, sample_y = dataiter.__next__()\n","\n","print('Sample input size: ', sample_x.size()) # batch_size, seq_length\n","print('Sample input: \\n', sample_x)\n","print('Sample input: \\n', sample_y)"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"jn0PzZdGfjEF"},"source":["Now, 
we will define the `SentimentRNN` class. Most of the model's class will be familiar to you, but there are two important layers we would like you to pay attention to:\n","\n","* Embedding Layer\n","> This layer is like a linear layer, but it makes it possible to use a sequence of indexes as inputs (instead of a sequence of one-hot-encoded vectors). During training, the Embedding layer learns a linear transformation from the space of words (a vector space of dimension `num_words_dict`) into a new, smaller vector space of dimension `embedding_dim`. We suggest you read this [thread](https://discuss.pytorch.org/t/how-does-nn-embedding-work/88518/3) and the [PyTorch documentation](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) if you want to learn more about this particular kind of layer.\n","\n","\n","* LSTM layer\n","> This is one of the most widely used classes of Recurrent Neural Networks. In PyTorch we can add several stacked layers in just one line of code. In our case, the number of stacked layers is set by the parameter `no_layers`. 
If you want to learn more about LSTMs we strongly recommend you this [Colahs thread](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) about them.\n","\n","\n","\n","\n","\n"]},{"cell_type":"code","execution_count":34,"metadata":{"execution":{},"executionInfo":{"elapsed":433,"status":"ok","timestamp":1720042567199,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"vfzcowAxfjEF"},"outputs":[],"source":["class SentimentRNN(nn.Module):\n"," def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.1):\n"," super(SentimentRNN,self).__init__()\n","\n"," self.output_dim = output_dim\n"," self.hidden_dim = hidden_dim\n"," self.no_layers = no_layers\n"," self.vocab_size = vocab_size\n"," self.drop_prob = drop_prob\n","\n"," # Embedding Layer\n"," self.embedding = nn.Embedding(vocab_size, embedding_dim)\n","\n"," # LSTM Layers\n"," self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,\n"," num_layers=no_layers, batch_first=True,\n"," dropout=self.drop_prob)\n","\n"," # Dropout layer\n"," self.dropout = nn.Dropout(drop_prob)\n","\n"," # Linear and Sigmoid layer\n"," self.fc = nn.Linear(self.hidden_dim, output_dim)\n"," self.sig = nn.Sigmoid()\n","\n"," def forward(self,x,hidden):\n"," batch_size = x.size(0)\n","\n"," # Embedding out\n"," embeds = self.embedding(x)\n"," #Shape: [batch_size x max_length x embedding_dim]\n","\n"," # LSTM out\n"," lstm_out, hidden = self.lstm(embeds, hidden)\n"," # Shape: [batch_size x max_length x hidden_dim]\n","\n"," # Select the activation of the last Hidden Layer\n"," lstm_out = lstm_out[:,-1,:].contiguous()\n"," # Shape: [batch_size x hidden_dim]\n","\n"," ## You can instead average the activations across all the times\n"," # lstm_out = torch.mean(lstm_out, 1).contiguous()\n","\n"," # Dropout and Fully connected layer\n"," out = self.dropout(lstm_out)\n"," out = self.fc(out)\n","\n"," # Sigmoid function\n"," sig_out = self.sig(out)\n","\n"," # return 
last sigmoid output and hidden state\n"," return sig_out, hidden\n","\n"," def init_hidden(self, batch_size):\n"," ''' Initializes hidden state '''\n"," # Create two new tensors with sizes n_layers x batch_size x hidden_dim,\n"," # initialized to zero, for hidden state and cell state of LSTM\n"," h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)\n"," c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)\n"," hidden = (h0,c0)\n"," return hidden"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"YfrLPa9mfjEF"},"source":["We choose the parameters of the model."]},{"cell_type":"code","execution_count":35,"metadata":{"execution":{},"executionInfo":{"elapsed":471,"status":"ok","timestamp":1720042569608,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"rOm-xoFkfjEG"},"outputs":[],"source":["# Parameters of our network\n","\n","# Size of our vocabulary\n","vocab_size = num_words_dict\n","\n","# Embedding dimension\n","embedding_dim = 32\n","\n","# Number of stacked LSTM layers\n","no_layers = 2\n","\n","# Dimension of the hidden layer in LSTMs\n","hidden_dim = 64\n","\n","# Dropout parameter for regularization\n","output_dim = 1\n","\n","# Dropout parameter for regularization\n","drop_prob = 0.25"]},{"cell_type":"code","execution_count":36,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":465,"status":"ok","timestamp":1720042571776,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"xapxpe7ufjEG","outputId":"51c90159-7d2b-4fc4-f34a-98e9901d40e4"},"outputs":[{"name":"stdout","output_type":"stream","text":["SentimentRNN(\n"," (embedding): Embedding(30000, 32)\n"," (lstm): LSTM(32, 64, num_layers=2, batch_first=True, dropout=0.25)\n"," (dropout): Dropout(p=0.25, inplace=False)\n"," (fc): Linear(in_features=64, out_features=1, bias=True)\n"," (sig): Sigmoid()\n",")\n"]}],"source":["# Let's define our 
model\n","model = SentimentRNN(no_layers, vocab_size, hidden_dim,\n"," embedding_dim, drop_prob=drop_prob)\n","# Moving to gpu\n","model.to(device)\n","print(model)"]},{"cell_type":"code","execution_count":37,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1720042571776,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"GEvTs3uwfjEG","outputId":"2e15f6df-2aa8-4665-b2da-7363d2bfa09e"},"outputs":[{"name":"stdout","output_type":"stream","text":["Total Number of parameters: 1018433\n"]}],"source":["# How many trainable parameters does our model have?\n","model_parameters = filter(lambda p: p.requires_grad, model.parameters())\n","params = sum([np.prod(p.size()) for p in model_parameters])\n","print('Total Number of parameters: ',params)"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"Pc2OC5GDfjEG"},"source":["We choose the losses and the optimizer for the training procces."]},{"cell_type":"code","execution_count":38,"metadata":{"execution":{},"executionInfo":{"elapsed":1740,"status":"ok","timestamp":1720042574210,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"iBWjPADUfjEG"},"outputs":[],"source":["# loss and optimization functions\n","lr = 0.001\n","\n","# Binary crossentropy is a good loss function for a binary classification problem\n","criterion = nn.BCELoss()\n","\n","# We choose an Adam optimizer\n","optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n","\n","# function to predict accuracy\n","def acc(pred,label):\n"," pred = torch.round(pred.squeeze())\n"," return torch.sum(pred == label.squeeze()).item()"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"OZgMwOe2fjEG"},"source":["We are ready to train our 
model."]},{"cell_type":"code","execution_count":39,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":304614,"status":"ok","timestamp":1720042880244,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"3B6YhEocfjEH","outputId":"76276a1f-7775-4b98-aab0-0e199aa133e4"},"outputs":[{"name":"stdout","output_type":"stream","text":["Epoch 1\n","train_loss : 0.4366412344621494 val_loss : 0.3881208170717582\n","train_accuracy : 79.485546875 val_accuracy : 82.475\n","Validation loss decreased (inf --> 0.388121). Saving model ...\n","==================================================\n","Epoch 2\n","train_loss : 0.3760281792609021 val_loss : 0.3713956154882908\n","train_accuracy : 83.186328125 val_accuracy : 83.4575\n","Validation loss decreased (0.388121 --> 0.371396). Saving model ...\n","==================================================\n","Epoch 3\n","train_loss : 0.3574051411205437 val_loss : 0.36425656544510276\n","train_accuracy : 84.19953125 val_accuracy : 83.80375\n","Validation loss decreased (0.371396 --> 0.364257). Saving model ...\n","==================================================\n","Epoch 4\n","train_loss : 0.344456663565943 val_loss : 0.3613302929420024\n","train_accuracy : 84.89265625 val_accuracy : 84.00874999999999\n","Validation loss decreased (0.364257 --> 0.361330). Saving model ...\n","==================================================\n","Epoch 5\n","train_loss : 0.33407817618339325 val_loss : 0.3601334386831149\n","train_accuracy : 85.444921875 val_accuracy : 84.03625\n","Validation loss decreased (0.361330 --> 0.360133). 
Saving model ...\n","==================================================\n"]}],"source":["# Number of training Epochs\n","epochs = 5\n","\n","# Maximum absolute value accepted for the gradeint\n","clip = 5\n","\n","# Initial Loss value (assumed big)\n","valid_loss_min = np.Inf\n","\n","# Lists to follow the evolution of the loss and accuracy\n","epoch_tr_loss,epoch_vl_loss = [],[]\n","epoch_tr_acc,epoch_vl_acc = [],[]\n","\n","# Train for a number of Epochs\n","for epoch in range(epochs):\n"," train_losses = []\n"," train_acc = 0.0\n"," model.train()\n","\n"," for inputs, labels in train_loader:\n","\n"," # Initialize hidden state\n"," h = model.init_hidden(batch_size)\n"," # Creating new variables for the hidden state\n"," h = tuple([each.data.to(device) for each in h])\n","\n"," # Move batch inputs and labels to gpu\n"," inputs, labels = inputs.to(device), labels.to(device)\n","\n"," # Set gradient to zero\n"," model.zero_grad()\n","\n"," # Compute model output\n"," output,h = model(inputs,h)\n","\n"," # Calculate the loss and perform backprop\n"," loss = criterion(output.squeeze(), labels.float())\n"," loss.backward()\n"," train_losses.append(loss.item())\n","\n"," # calculating accuracy\n"," accuracy = acc(output,labels)\n"," train_acc += accuracy\n","\n"," #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.\n"," nn.utils.clip_grad_norm_(model.parameters(), clip)\n"," optimizer.step()\n","\n","\n"," # Evaluate on the validation set for this epoch\n"," val_losses = []\n"," val_acc = 0.0\n"," model.eval()\n"," for inputs, labels in valid_loader:\n","\n"," # Initialize hidden state\n"," val_h = model.init_hidden(batch_size)\n"," val_h = tuple([each.data.to(device) for each in val_h])\n","\n"," # Move batch inputs and labels to gpu\n"," inputs, labels = inputs.to(device), labels.to(device)\n","\n"," # Compute model output\n"," output, val_h = model(inputs, val_h)\n","\n"," # Compute Loss\n"," val_loss = criterion(output.squeeze(), 
labels.float())\n","\n"," val_losses.append(val_loss.item())\n","\n"," accuracy = acc(output,labels)\n"," val_acc += accuracy\n","\n"," epoch_train_loss = np.mean(train_losses)\n"," epoch_val_loss = np.mean(val_losses)\n"," epoch_train_acc = train_acc/len(train_loader.dataset)\n"," epoch_val_acc = val_acc/len(valid_loader.dataset)\n"," epoch_tr_loss.append(epoch_train_loss)\n"," epoch_vl_loss.append(epoch_val_loss)\n"," epoch_tr_acc.append(epoch_train_acc)\n"," epoch_vl_acc.append(epoch_val_acc)\n"," print(f'Epoch {epoch+1}')\n"," print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')\n"," print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')\n"," if epoch_val_loss <= valid_loss_min:\n"," print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min,epoch_val_loss))\n"," # torch.save(model.state_dict(), '../working/state_dict.pt')\n"," valid_loss_min = epoch_val_loss\n"," print(25*'==')"]},{"cell_type":"code","execution_count":40,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":364},"execution":{},"executionInfo":{"elapsed":894,"status":"ok","timestamp":1720042881135,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"ttJazP-nfjEH","outputId":"992bed02-611e-4614-c60f-77223d5b801a"},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["fig = plt.figure(figsize = (20, 6))\n","plt.subplot(1, 2, 1)\n","plt.plot(epoch_tr_acc, label='Train Acc')\n","plt.plot(epoch_vl_acc, label='Validation Acc')\n","plt.title(\"Accuracy\")\n","plt.legend()\n","plt.grid()\n","\n","plt.subplot(1, 2, 2)\n","plt.plot(epoch_tr_loss, label='Train loss')\n","plt.plot(epoch_vl_loss, label='Validation loss')\n","plt.title(\"Loss\")\n","plt.legend()\n","plt.grid()\n","\n","plt.show()"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"iUyaF-EbfjEH"},"source":["---\n","# What's Next?\n","\n","You can use this project template as a starting point to think about your own project. There are a lot of ways to continue, here we share with you some ideas you migth find useful:\n","\n","* **Work on the Preproccesing.** We used a very rudimentary way to tokenize tweets. But there are better ways to preprocess the data. Can you think of a suitable way to preprocess the data for this particular task? How does the performance of the model change when the data is processed correctly?\n","* **Work on the Model.** The RNN model proposed in this notebook is not optimized at all. You can work on finding a better architecture or better hyperparamenters. May be using bidirectonal LSTMs or increasing the number of stacked layers can improve the performance, feel free to try different approaches.\n","* **Work on the Embedding.** Our model learnt an embedding during the training on this Twitter corpus for a particular task. You can explore the representation of different words in this learned embedding. Also, you can try using different word embeddings. You can train them on this corpus or you can use an embedding trained on another corpus of data. How does the change of the embedding affect the model performance?\n","* **Try sentiment analysis on another dataset.** There are lots of available dataset to work with, we can help you find one that is interesting to you. 
Do you belive that a sentiment analysis model trained on some corpus (Twitter dataset) will perform well on another type of data (for example, youtube comments)?\n","\n"]}],"metadata":{"accelerator":"GPU","colab":{"provenance":[],"toc_visible":true},"kernel":{"display_name":"Python 3","language":"python","name":"python3"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.2"},"widgets":{"application/vnd.jupyter.widget-state+json":{"057e918ace004506aedc4e4b9942c3a8":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_9baa1a735c0646b89953bf4a7c7fc92c","placeholder":"​","style":"IPY_MODEL_0ac9711c8ece4c5397a8cd810713adfb","value":"Downloading readme: 
100%"}},"0ac9711c8ece4c5397a8cd810713adfb":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"0bdc146792a64853ae06a9d185aa2b15":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1cef38981af6457dbaeb393f9936a389":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_b0b5cfae51214c60bbca9a09b196c217","IPY_MODEL_5ee2a4b33be04c6db8ee4d7995c2376d","IPY_MODEL_403fffb635c2409ebeabc90063750ed3"],"
layout":"IPY_MODEL_6279343019064572adedf34cfbd437fa"}},"1cf3ba0f756f4aa5ad1dcb675a791cfa":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"2475bd62a3224bacb38a6334d07d6a8c":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"252949e8784c4878a62eb2e30b1e3466":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"2715d00db77545f9aa5eace8a0eb2839":{"model_module":"@jupyter-widgets/base","model_module_
version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2c42e2fef6314c9e842a7e9641af3cab":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2ead0216695e
4227aef44552f4ec3cc9":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_a9a0f6ce71ed415c8c8513f68e34e162","max":1600000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_7f638a6deacd42e88c031fa47797516b","value":1600000}},"325387f6b62d47b0b21bea61676cea72":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_a8d69769921241b8b1081e84f7770858","max":6837,"min":0,"orientation":"horizontal","style":"IPY_MODEL_d189f24b0e964d1a9fc86379bad38cca","value":6837}},"3610a2db297f4686bf9043f2b7ee55b5":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"39fa73efcbf54d8dad225d8380061dbf":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widge
ts/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_0bdc146792a64853ae06a9d185aa2b15","placeholder":"​","style":"IPY_MODEL_768da964ffcd44fea1af09e81f5621f3","value":" 320000/320000 [00:06<00:00, 58691.43it/s]"}},"3d29947b5d2d4e2abc1355d900096642":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"3f7a8f56f15c434da70029366a37167a":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"40262cb3eefa45fcbe37aaafccb69f5f":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,
"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"403fffb635c2409ebeabc90063750ed3":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2c42e2fef6314c9e842a7e9641af3cab","placeholder":"​","style":"IPY_MODEL_913d95e58aa94e4a8009768a23fbf304","value":" 498/498 [00:00<00:00, 7393.07 examples/s]"}},"48b812211db04284bfbbf02823fb879a":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a1bd0616199e44538977ee2ea6049690","placeholder":"​","style":"IPY_MODEL_835fb9a91b34471fa6d61adf37616f52","value":" 1280000/1280000 [00:22<00:00, 
77416.28it/s]"}},"49c5a3fbe87b491cb3f0f450a0af0659":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"510eeffb32694e7798f23e3931d7a943":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_49c5a3fbe87b491cb3f0f450a0af0659","placeholder":"​","style":"IPY_MODEL_252949e8784c4878a62eb2e30b1e3466","value":"Downloading data: 
100%"}},"53843f49adda4bce8450fd91fa9fd587":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_849e39cc86f64e558ff94bf542a5121a","placeholder":"​","style":"IPY_MODEL_67b0b03c391c414bad5ea9fb3c947a2f","value":" 1600000/1600000 [01:18<00:00, 14710.70 examples/s]"}},"5455119809c74916acc50e1905903ded":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"55ba92cfe0724286ac1c2bbe6577e5c8":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_na
me":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5ee2a4b33be04c6db8ee4d7995c2376d":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_f04df4daeb6049ab85d3d75b472ccf6e","max":498,"min":0,"orientation":"horizontal","style":"IPY_MODEL_fd0b3c53b66543cea0c396d8047445a8","value":498}},"5fa7ab2ab2004e5cb692199e2bd27d6b":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_c
olumns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6279343019064572adedf34cfbd437fa":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"67a4fa49ca5349d58512a16a3742d401":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"67b0b03c391c414bad5ea9fb3c947a2f":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-
widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"6b6cc35257fe433e93736d02e898b6b8":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6b7286d74e0f4a0199dbfcaf3dd0d622":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"6d64402d9da74516ab4e1d46ae9f1ee3":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-
widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_5fa7ab2ab2004e5cb692199e2bd27d6b","max":4033,"min":0,"orientation":"horizontal","style":"IPY_MODEL_ab71bd2b452146829e973d6cf99f31ed","value":4033}},"6e6c5372ffe045c0b72587989567429e":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_b54b826314ea4b3a92eebd218c093fc1","placeholder":"​","style":"IPY_MODEL_8cd7be688b8c4818be48915db14a0792","value":"Generating train split: 100%"}},"768da964ffcd44fea1af09e81f5621f3":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"78d48ee2fb9f42089f475fcf5fc368c8":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e0fc900d8b5940a6bd6a97e58adb4651","placeholder":"​","style":"IPY_MODEL_6b7286d74e0f4a0199dbfcaf3dd0d622","value":"100%"}},"7bcef602e7f441308472bc145b12dcd3":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"La
youtModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7f638a6deacd42e88c031fa47797516b":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"835fb9a91b34471fa6d61adf37616f52":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"84485541f3a14c65a67d10a97b72bbad":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"Descript
ionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"849e39cc86f64e558ff94bf542a5121a":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8cd7be688b8c4818be48915db14a0792":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"90908b6f69524a72860214ef8bd2d946":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"a
lign_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"913d95e58aa94e4a8009768a23fbf304":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"942ce490d87347c789e229589b1b9c9f":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"96c2d7ee644a438982e1792b7ec0453c":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flo
w":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"97fb30a5a31742efa1d188b9361f9938":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"9a7140a6197945d5bac5c48b820dfb04":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"9b34daddb9cc48bba109e547177ec654":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template
_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"9baa1a735c0646b89953bf4a7c7fc92c":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a1bd0616199e44538977ee2ea6049690":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":nu
ll,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a4bbd3df99cd4acab5e1b3ba5cd7c114":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a8b3dfaa2831416582d8eeef01451386":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_7bcef602e7f441308472bc14
5b12dcd3","max":81363704,"min":0,"orientation":"horizontal","style":"IPY_MODEL_97fb30a5a31742efa1d188b9361f9938","value":81363704}},"a8d69769921241b8b1081e84f7770858":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a9a0f6ce71ed415c8c8513f68e34e162":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_
height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"ab71bd2b452146829e973d6cf99f31ed":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"afd671543846468abfe37669a72845c3":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_057e918ace004506aedc4e4b9942c3a8","IPY_MODEL_325387f6b62d47b0b21bea61676cea72","IPY_MODEL_ea1e3eb0e6ec4f8d82cf9b12cfe6e700"],"layout":"IPY_MODEL_96c2d7ee644a438982e1792b7ec0453c"}},"b0b5cfae51214c60bbca9a09b196c217":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2715d00db77545f9aa5eace8a0eb2839","placeholder":"​","style":"IPY_MODEL_942ce490d87347c789e229589b1b9c9f","value":"Generating test split: 
100%"}},"b0ca3012d0b84c5a9d7c1fc176251af7":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_a4bbd3df99cd4acab5e1b3ba5cd7c114","max":320000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_9a7140a6197945d5bac5c48b820dfb04","value":320000}},"b54b826314ea4b3a92eebd218c093fc1":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c29e06a72ac9401b8c41f4195021071e":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_modul
e":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_3f7a8f56f15c434da70029366a37167a","max":1280000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_3610a2db297f4686bf9043f2b7ee55b5","value":1280000}},"c432c4efcb794ce781fcb6f176f1b60d":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_510eeffb32694e7798f23e3931d7a943","IPY_MODEL_a8b3dfaa2831416582d8eeef01451386","IPY_MODEL_db1cdafaf36f4c339476f3221abc17b3"],"layout":"IPY_MODEL_ffd3778a96e046718828bbc5aa73f173"}},"d189f24b0e964d1a9fc86379bad38cca":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"d8de1a85076b453a92295e79110ba8fd":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_78d48ee2fb9f42089f475fcf5fc368c8","IPY_MODEL_b0ca3012d0b84c5a9d7c1fc176251af7","IPY_MODEL_39fa73efcbf54d8dad225d8380061dbf"],"layout":"IPY_MODEL_6b6cc35257fe433e93736d02e898b6b8"}},"d9ca809f7b1c49e595a05458251f3ab2":{"model_module"
:"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_55ba92cfe0724286ac1c2bbe6577e5c8","placeholder":"​","style":"IPY_MODEL_67a4fa49ca5349d58512a16a3742d401","value":" 4.03k/4.03k [00:00<00:00, 114kB/s]"}},"db1cdafaf36f4c339476f3221abc17b3":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_9b34daddb9cc48bba109e547177ec654","placeholder":"​","style":"IPY_MODEL_fd2b5a6533794a2794579956c25247fb","value":" 81.4M/81.4M [00:06<00:00, 
15.3MB/s]"}},"db432a2cd6244a7592fc9732f0ca4738":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"db9bf44dec914db793cc4f73751c272c":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"df7eba182d1b4c21bc21d157eac6b996":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_db432a2cd6244a7592fc9732f0ca4738","placeholder":"​","style":"IPY_MODEL_84485541f3a14c65a67d10a97b72bbad","value":"Downloading builder script: 100%"}},"e0fc900d8b5940a6bd6a97e58adb4651":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e1348a02ceeb4af19fbd63d52b7d843b":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":
null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_fbf51b14e6b34d0485ddf59c43d22c49","IPY_MODEL_c29e06a72ac9401b8c41f4195021071e","IPY_MODEL_48b812211db04284bfbbf02823fb879a"],"layout":"IPY_MODEL_5455119809c74916acc50e1905903ded"}},"ea1e3eb0e6ec4f8d82cf9b12cfe6e700":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_db9bf44dec914db793cc4f73751c272c","placeholder":"​","style":"IPY_MODEL_1cf3ba0f756f4aa5ad1dcb675a791cfa","value":" 6.84k/6.84k [00:00<00:00, 157kB/s]"}},"f04df4daeb6049ab85d3d75b472ccf6e":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f3a9667c8c994324a2409f227bd0a1e9":{"model
_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_6e6c5372ffe045c0b72587989567429e","IPY_MODEL_2ead0216695e4227aef44552f4ec3cc9","IPY_MODEL_53843f49adda4bce8450fd91fa9fd587"],"layout":"IPY_MODEL_40262cb3eefa45fcbe37aaafccb69f5f"}},"fbb4191426bd485e8e965b6d432eecae":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_df7eba182d1b4c21bc21d157eac6b996","IPY_MODEL_6d64402d9da74516ab4e1d46ae9f1ee3","IPY_MODEL_d9ca809f7b1c49e595a05458251f3ab2"],"layout":"IPY_MODEL_90908b6f69524a72860214ef8bd2d946"}},"fbf51b14e6b34d0485ddf59c43d22c49":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2475bd62a3224bacb38a6334d07d6a8c","placeholder":"​","style":"IPY_MODEL_3d29947b5d2d4e2abc1355d900096642","value":"100%"}},"fd0b3c53b66543cea0c396d8047445a8":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_coun
t":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"fd2b5a6533794a2794579956c25247fb":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ffd3778a96e046718828bbc5aa73f173":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}}}}},"nbformat":4,"nbformat_minor":0} From 288633f4382a3b4ad15e8109589dfaf41940b281 Mon Sep 17 00:00:00 2001 From: Soan Kim <39689481+SoanKim@users.noreply.github.com> Date: Wed, 3 Jul 2024 18:41:04 +0900 Subject: [PATCH 04/12] =?UTF-8?q?#=20blurry=5Fvision.ipynb:=20=E2=80=9Ccat?= =?UTF-8?q?s-and-dogs.zip=E2=80=9D=20has=20zero=20bytes,=20and=20the=20dow?= 
=?UTF-8?q?nloadable=20link=20doesn=E2=80=99t=20exist.=20404=20-=20File=20?= =?UTF-8?q?or=20directory=20not=20found?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # cellular_segmentation.ipynb: "numpy long type issue" # lunar_lanter: swig should be installed before gym[box2d] to avoid the error --- projects/Neuroscience/blurry_vision.ipynb | 4 ++-- projects/Neuroscience/cellular_segmentation.ipynb | 9 ++++++--- projects/ReinforcementLearning/lunar_lander.ipynb | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/projects/Neuroscience/blurry_vision.ipynb b/projects/Neuroscience/blurry_vision.ipynb index 320e6d77d..ee68c5f6d 100644 --- a/projects/Neuroscience/blurry_vision.ipynb +++ b/projects/Neuroscience/blurry_vision.ipynb @@ -60,7 +60,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " Building wheel for torch-intermediate-layer-getter (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + " Building wheel for torch-intermediate-layer-getter (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n" ] } ], @@ -1877,7 +1877,7 @@ "# # Download the Data\n", "# if \"cats-and-dogs.zip\" not in os.listdir():\n", "# !wget --no-check-certificate \\\n", - "# \"https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip\" \\\n", + "# \"https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_5340.zip\" \\\n", "# -O \"cats-and-dogs.zip\"\n", "\n", "# local_zip = 'cats-and-dogs.zip'\n", diff --git a/projects/Neuroscience/cellular_segmentation.ipynb b/projects/Neuroscience/cellular_segmentation.ipynb index 7903fa871..39ea6c19b 100644 --- a/projects/Neuroscience/cellular_segmentation.ipynb +++ b/projects/Neuroscience/cellular_segmentation.ipynb @@ -304,14 +304,14 @@ "labels_train = np.zeros((len(masks_train), 2,\n", " masks_train.shape[-2],\n", " masks_train.shape[-1]),\n", - " np.long)\n", + " np.longlong)\n", "labels_train[:, 0] = masks_train == 0\n", "labels_train[:, 1] = masks_train > 0\n", "\n", "labels_test = np.zeros((len(masks_test), 2,\n", " masks_test.shape[-2],\n", " masks_test.shape[-1]),\n", - " np.long)\n", + " np.longlong)\n", "labels_test[:, 0] = masks_test == 0\n", "labels_test[:, 1] = masks_test > 0" ] @@ -831,12 +831,15 @@ " for ibatch in np.arange(0, n_train, batch_size):\n", " # augment the data\n", " inds = np.arange(ibatch, min(n_train, ibatch+batch_size))\n", + " train_data = train_data.astype(np.float32)\n", + " train_labels = train_labels.astype(np.float32)\n", " imgs, lbls, _ = random_rotate_and_resize(train_data[inds],\n", " train_labels[inds])\n", "\n", " # transfer to torch + GPU\n", " imgs = torch.from_numpy(imgs).to(device=device)\n", " lbls = torch.from_numpy(lbls).to(device=device)\n", + " lbls = lbls.long()\n", "\n", " # compute the loss\n", " y = net(imgs)\n", @@ -1379,7 +1382,7 @@ "from tifffile import imread\n", "\n", "fname = \"gt1.tif\"\n", - "url = \"https://www.suite2p.org/test_data/gt1.tif\"\n", + "url = 
\"https://www.suite2p.org/test_data/gt1.tif\" # This URL does not exist.\n", "\n", "if not os.path.isfile(fname):\n", " try:\n", diff --git a/projects/ReinforcementLearning/lunar_lander.ipynb b/projects/ReinforcementLearning/lunar_lander.ipynb index 8106a6c89..7622f5b5a 100644 --- a/projects/ReinforcementLearning/lunar_lander.ipynb +++ b/projects/ReinforcementLearning/lunar_lander.ipynb @@ -102,12 +102,12 @@ "!pip install rarfile --quiet\n", "!pip install stable-baselines3[extra] --quiet\n", "!pip install ale-py --quiet\n", + "!pip install -q swig --quiet\n", "!pip install gym[box2d] --quiet\n", "!pip install pyvirtualdisplay --quiet\n", "!pip install pyglet --quiet\n", "!pip install pygame --quiet\n", "!pip install minigrid --quiet\n", - "!pip install -q swig --quiet\n", "!pip install -q gymnasium[box2d] --quiet\n", "!pip install 'minigrid<=2.1.1' --quiet\n", "!pip3 install box2d-py --quiet" From 02925083085e48890ca6b848180484490a68725d Mon Sep 17 00:00:00 2001 From: Soan Kim <39689481+SoanKim@users.noreply.github.com> Date: Sat, 6 Jul 2024 06:50:14 +0900 Subject: [PATCH 05/12] suppressed excessive root-user warning messages --- projects/ReinforcementLearning/human_rl.ipynb | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/projects/ReinforcementLearning/human_rl.ipynb b/projects/ReinforcementLearning/human_rl.ipynb index ae9d35d7e..e72d4367f 100644 --- a/projects/ReinforcementLearning/human_rl.ipynb +++ b/projects/ReinforcementLearning/human_rl.ipynb @@ -64,24 +64,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "numba 0.56.4 requires numpy<1.24,>=1.18, but you have numpy 1.25.1 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "chex 0.1.81 requires numpy>=1.25.0, but you have numpy 1.23.3 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" + "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "numba 0.56.4 requires numpy<1.24,>=1.18, but you have numpy 1.25.1 which is incompatible.\u001B[0m\u001B[31m\n", + "\u001B[0m\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "chex 0.1.81 requires numpy>=1.25.0, but you have numpy 1.23.3 which is incompatible.\u001B[0m\u001B[31m\n", + "\u001B[0m" ] } ], "source": [ "# @title Install dependencies\n", - "!pip install jedi --quiet\n", - "!pip install --upgrade pip setuptools wheel --quiet\n", - "!pip install dm-acme[jax] --quiet\n", - "!pip install dm-sonnet --quiet\n", - "!pip install trfl --quiet\n", - "!pip install numpy==1.23.3 --quiet --ignore-installed\n", - "!pip uninstall seaborn -y --quiet\n", - "!pip install seaborn --quiet" + "!pip install jedi --quiet --root-user-action=ignore\n", + "!pip install --upgrade pip setuptools wheel --quiet --root-user-action=ignore\n", + "!pip install dm-acme[jax] --quiet --root-user-action=ignore\n", + "!pip install dm-sonnet --quiet --root-user-action=ignore\n", + "!pip install trfl --quiet --root-user-action=ignore\n", + "!pip install numpy==1.23.3 --quiet --ignore-installed --root-user-action=ignore\n", + "!pip uninstall seaborn -y --quiet 
--root-user-action=ignore\n", + "!pip install seaborn --quiet --root-user-action=ignore" ] }, { From 36080adc1539e96ecc4e7ba493dd2199ce7dd996 Mon Sep 17 00:00:00 2001 From: dalia-nasr Date: Sat, 6 Jul 2024 17:55:20 +0300 Subject: [PATCH 06/12] restarted kernel and cleared output cells --- .../sentiment_analysis.ipynb | 4641 ++++++++++++++++- 1 file changed, 4640 insertions(+), 1 deletion(-) diff --git a/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb b/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb index a6f073666..eb393529a 100644 --- a/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb +++ b/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb @@ -1 +1,4640 @@ -{"cells":[{"cell_type":"markdown","metadata":{"execution":{},"id":"view-in-github"},"source":["\"Open   \"Open"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"D_fgc45VfjDz"},"source":["# Twitter Sentiment Analysis\n","\n","**By Neuromatch Academy**\n","\n","__Content creators:__ Juan Manuel Rodriguez, Salomey Osei, Gonzalo Uribarri\n","\n","__Production editors:__ Amita Kapoor, Spiros Chavlis"]},{"cell_type":"markdown","metadata":{"execution":{}},"source":["---\n","# Welcome to the NLP project template\n","\n",""]},{"cell_type":"markdown","metadata":{"execution":{}},"source":["---\n","# Step 1: Questions and goals\n","\n","* Can we infer emotion from a tweet text?\n","* How words are distributed accross the dataset?\n","* Are words related to one kind of emotion?"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"Vd1qdNW9fjD1"},"source":["---\n","# Step 2: Literature review\n","\n","[Original Dataset Paper](https://cs.stanford.edu/people/alecmgo/papers/TwitterDistantSupervision09.pdf)\n","\n","[Papers with code](https://paperswithcode.com/dataset/imdb-movie-reviews)"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"oOYDQElpfjD2"},"source":["---\n","# Step 3: Load and explore the 
dataset"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":103706,"status":"ok","timestamp":1720042135196,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"EZpxSExUfjD2","outputId":"19b01445-9b83-4a93-9cc2-7830ab0dcf5b"},"outputs":[],"source":["# @title Install dependencies\n","!pip install pandas --quiet\n","!pip install torchtext --quiet\n","!pip install datasets --quiet"]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":9008,"status":"ok","timestamp":1720042144200,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"DxqD3Tk5fjD3","outputId":"451d68c5-7894-4f93-9f54-bf0b7f482e20"},"outputs":[{"name":"stderr","output_type":"stream","text":["/usr/local/lib/python3.10/dist-packages/torchtext/data/__init__.py:4: UserWarning: \n","/!\\ IMPORTANT WARNING ABOUT TORCHTEXT STATUS /!\\ \n","Torchtext is deprecated and the last released version will be 0.18 (this one). 
You can silence this warning by calling the following at the beginnign of your scripts: `import torchtext; torchtext.disable_torchtext_deprecation_warning()`\n"," warnings.warn(torchtext._TORCHTEXT_DEPRECATION_MSG)\n"]}],"source":["# We import some libraries to load the dataset\n","import os\n","import numpy as np\n","import pandas as pd\n","import matplotlib.pyplot as plt\n","\n","from datasets import load_dataset\n","\n","from collections import Counter\n","from tqdm.notebook import tqdm\n","\n","import torch\n","import torch.nn as nn\n","import torch.optim as optim\n","import torch.nn.functional as F\n","from torch.utils.data import TensorDataset, DataLoader\n","\n","import torchtext\n","from torchtext.data import get_tokenizer\n","\n","from sklearn.utils import shuffle\n","from sklearn.metrics import classification_report\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.model_selection import train_test_split\n","from sklearn.feature_extraction.text import CountVectorizer"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"63Eg1SLbfjD4"},"source":["You can find the dataset we are going to use in [this 
website](http://help.sentiment140.com/for-students/)."]},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":567,"referenced_widgets":["fbb4191426bd485e8e965b6d432eecae","df7eba182d1b4c21bc21d157eac6b996","6d64402d9da74516ab4e1d46ae9f1ee3","d9ca809f7b1c49e595a05458251f3ab2","90908b6f69524a72860214ef8bd2d946","db432a2cd6244a7592fc9732f0ca4738","84485541f3a14c65a67d10a97b72bbad","5fa7ab2ab2004e5cb692199e2bd27d6b","ab71bd2b452146829e973d6cf99f31ed","55ba92cfe0724286ac1c2bbe6577e5c8","67a4fa49ca5349d58512a16a3742d401","afd671543846468abfe37669a72845c3","057e918ace004506aedc4e4b9942c3a8","325387f6b62d47b0b21bea61676cea72","ea1e3eb0e6ec4f8d82cf9b12cfe6e700","96c2d7ee644a438982e1792b7ec0453c","9baa1a735c0646b89953bf4a7c7fc92c","0ac9711c8ece4c5397a8cd810713adfb","a8d69769921241b8b1081e84f7770858","d189f24b0e964d1a9fc86379bad38cca","db9bf44dec914db793cc4f73751c272c","1cf3ba0f756f4aa5ad1dcb675a791cfa","c432c4efcb794ce781fcb6f176f1b60d","510eeffb32694e7798f23e3931d7a943","a8b3dfaa2831416582d8eeef01451386","db1cdafaf36f4c339476f3221abc17b3","ffd3778a96e046718828bbc5aa73f173","49c5a3fbe87b491cb3f0f450a0af0659","252949e8784c4878a62eb2e30b1e3466","7bcef602e7f441308472bc145b12dcd3","97fb30a5a31742efa1d188b9361f9938","9b34daddb9cc48bba109e547177ec654","fd2b5a6533794a2794579956c25247fb","f3a9667c8c994324a2409f227bd0a1e9","6e6c5372ffe045c0b72587989567429e","2ead0216695e4227aef44552f4ec3cc9","53843f49adda4bce8450fd91fa9fd587","40262cb3eefa45fcbe37aaafccb69f5f","b54b826314ea4b3a92eebd218c093fc1","8cd7be688b8c4818be48915db14a0792","a9a0f6ce71ed415c8c8513f68e34e162","7f638a6deacd42e88c031fa47797516b","849e39cc86f64e558ff94bf542a5121a","67b0b03c391c414bad5ea9fb3c947a2f","1cef38981af6457dbaeb393f9936a389","b0b5cfae51214c60bbca9a09b196c217","5ee2a4b33be04c6db8ee4d7995c2376d","403fffb635c2409ebeabc90063750ed3","6279343019064572adedf34cfbd437fa","2715d00db77545f9aa5eace8a0eb2839","942ce490d87347c789e229589b1b9c9f","f04df4daeb6049ab85d3d75b
472ccf6e","fd0b3c53b66543cea0c396d8047445a8","2c42e2fef6314c9e842a7e9641af3cab","913d95e58aa94e4a8009768a23fbf304"]},"execution":{},"executionInfo":{"elapsed":189390,"status":"ok","timestamp":1720042333586,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"3HLOsd3rfjD4","outputId":"7653fee1-a871-472b-a978-d8ec0250dc84"},"outputs":[{"name":"stderr","output_type":"stream","text":["/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n","The secret `HF_TOKEN` does not exist in your Colab secrets.\n","To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n","You will be able to reuse this secret in all of your notebooks.\n","Please note that authentication is recommended but still optional to access public models or datasets.\n"," warnings.warn(\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"fbb4191426bd485e8e965b6d432eecae","version_major":2,"version_minor":0},"text/plain":["Downloading builder script: 0%| | 0.00/4.03k [00:00\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
polarityuserdatequeryusertext
00_TheSpecialOne_Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, t...
10scotthamiltonMon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
20mattycusMon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
30ElleCTFMon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
40KaroliMon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n"," \n"],"text/plain":[" polarity user date query \\\n","0 0 _TheSpecialOne_ Mon Apr 06 22:19:45 PDT 2009 NO_QUERY \n","1 0 scotthamilton Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n","2 0 mattycus Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n","3 0 ElleCTF Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n","4 0 Karoli Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n","\n"," user text \n","0 _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, t... \n","1 scotthamilton is upset that he can't update his Facebook by ... \n","2 mattycus @Kenichan I dived many times for the ball. Man... \n","3 ElleCTF my whole body feels itchy and like its on fire \n","4 Karoli @nationwideclass no, it's not behaving at all.... "]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["# We load the dataset\n","\n","dataset = load_dataset(\"stanfordnlp/sentiment140\", trust_remote_code= True)\n","\n","train_data = dataset[\"train\"]\n","df = pd.DataFrame(train_data)\n","df = df.rename(columns={'sentiment': 'polarity'})\n","df = df[['polarity', 'user', 'date', 'query', 'user', 'text']]\n","df.head()"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"fuKShcfjfjD4"},"source":["For this project we will use only the text and the polarity of the tweet. 
Notice that polarity is 0 for negative tweets and 4 for positive tweet."]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{},"executionInfo":{"elapsed":1059,"status":"ok","timestamp":1720042334642,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"GXHQOn6gfjD5"},"outputs":[],"source":["X = df.text.values\n","\n","# Changes values from [0,4] to [0,1]\n","y = (df.polarity.values > 1).astype(int)\n","\n","\n","# Split the data into train and test\n","x_train_text, x_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"7kr3TO_LfjD5"},"source":["The first thing we have to do before working on the models is to familiarize ourselves with the dataset. This is called Exploratory Data Analisys (EDA)."]},{"cell_type":"code","execution_count":5,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":12,"status":"ok","timestamp":1720042334642,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"FsL-xY03fjD5","outputId":"655f0ef8-c177-4f42-c024-1d628241401a"},"outputs":[{"name":"stdout","output_type":"stream","text":["1: @paisleypaisley LOL why do i get ideas so far in advance? it's not even june yet! we need a third knitter to have our own summer group \n","0: worst headache ever \n","0: @ewaniesciuszko i am so sad i wont see you! I miss you already. and yeah! that's perfect; i come back the 18th!\n","1: doesn't know how to spell conked \n","0: "So we stand here now and no one knows us at all I won't get used to this I won't get used to being gone"...I miss home and everyone -a\n"]}],"source":["for s, l in zip(x_train_text[:5], y_train[:5]):\n"," print('{}: {}'.format(l, s))"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"4cPGXSc-fjD5"},"source":["An interesting thing to analyze is the Word Distribution. 
In order to count the occurrences of each word, we should tokenize the sentences first."]},{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":9,"status":"ok","timestamp":1720042334642,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"U1OugpZ0fjD5","outputId":"9e6cb4e3-8d8c-4db0-c113-bdd4fe87db5f"},"outputs":[{"name":"stdout","output_type":"stream","text":["Before Tokenize: worst headache ever \n","After Tokenize: ['worst', 'headache', 'ever']\n"]}],"source":["tokenizer = get_tokenizer(\"basic_english\")\n","\n","print('Before Tokenize: ', x_train_text[1])\n","print('After Tokenize: ', tokenizer(x_train_text[1]))"]},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":81,"referenced_widgets":["e1348a02ceeb4af19fbd63d52b7d843b","fbf51b14e6b34d0485ddf59c43d22c49","c29e06a72ac9401b8c41f4195021071e","48b812211db04284bfbbf02823fb879a","5455119809c74916acc50e1905903ded","2475bd62a3224bacb38a6334d07d6a8c","3d29947b5d2d4e2abc1355d900096642","3f7a8f56f15c434da70029366a37167a","3610a2db297f4686bf9043f2b7ee55b5","a1bd0616199e44538977ee2ea6049690","835fb9a91b34471fa6d61adf37616f52","d8de1a85076b453a92295e79110ba8fd","78d48ee2fb9f42089f475fcf5fc368c8","b0ca3012d0b84c5a9d7c1fc176251af7","39fa73efcbf54d8dad225d8380061dbf","6b6cc35257fe433e93736d02e898b6b8","e0fc900d8b5940a6bd6a97e58adb4651","6b7286d74e0f4a0199dbfcaf3dd0d622","a4bbd3df99cd4acab5e1b3ba5cd7c114","9a7140a6197945d5bac5c48b820dfb04","0bdc146792a64853ae06a9d185aa2b15","768da964ffcd44fea1af09e81f5621f3"]},"execution":{},"executionInfo":{"elapsed":29122,"status":"ok","timestamp":1720042363757,"user":{"displayName":"Dalia 
Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"7ZggzGCXfjD6","outputId":"ae19f8d6-224d-4224-d3a0-d00c659ec9b2"},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"e1348a02ceeb4af19fbd63d52b7d843b","version_major":2,"version_minor":0},"text/plain":[" 0%| | 0/1280000 [00:00"]},"metadata":{},"output_type":"display_data"}],"source":["plt.bar(range(100), [words[w] for w in sorted_words[:100]])\n","plt.show()"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"o9IYA0cZfjD7"},"source":["It is very common to find this kind of distribution when analyzing corpus of text. This is referred to as the [zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law)."]},{"cell_type":"markdown","metadata":{"execution":{},"id":"5FQIOqoRfjD7"},"source":["Usually the number of words in the dictionary will be very large.\n","\n","Here are some thing we can do to reduce that number:\n","\n","* Remove puntuation.\n","* Remove stop-words.\n","* Steaming.\n","* Remove very uncommon words (the words that appears in fewer than N occations).\n","* Nothing: we can use a pretrain model that handles this kind of situations.\n","\n","\n","We used one of the simplest tokenizers availables. This tokenizer does not take into account many quirks of the language. Moreover, diferent languages have different quirks, so there is no \"universal\" tokenizers. There are many libraries that have \"better\" tokenizers:\n","\n","* [Spacy](https://spacy.io/): it can be accessed using: `get_tokenizer(\"spacy\")`. Spacy supports a wide range of languages.\n","* [Huggingface](https://huggingface.co/): it has many tokenizers for different laguages. [Doc](https://huggingface.co/transformers/main_classes/tokenizer.html)\n","* [NLTK](https://www.nltk.org/): it provides several tokenizers. 
One of them can be accessed using: `get_tokenizer(\"toktok\")`\n"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"_ul5MgYcfjD7"},"source":["---\n","# Step 4: choose toolkit\n","\n","Our goal is to train a model capable of estimating the sentiment of a tweet (positive or negative) by reading its content. To that end we will try 2 different approaches:\n","\n","* A logistic regression using sklearn. **NOTE**: it can probaly work better than an SVM model.\n","* A simple Embedding + RNN."]},{"cell_type":"markdown","metadata":{"execution":{},"id":"GteI1PxTfjD7"},"source":["## Logistic regression\n","\n","We will represent our senteces using binary vectorization. This means that our data would be represented as a matrix of instances by word with a one if the word is in the instance, and zero otherwise. Sklean vectorizers can also do things such as stop-word removal and puntuation removal, you can read more about in [the documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)."]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{},"executionInfo":{"elapsed":22699,"status":"ok","timestamp":1720042396408,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"S_ei2qu8fjD7"},"outputs":[],"source":["vectorizer = CountVectorizer(binary=True)\n","x_train_cv = vectorizer.fit_transform(x_train_text)\n","x_test_cv = vectorizer.transform(x_test_text)"]},{"cell_type":"code","execution_count":12,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":17,"status":"ok","timestamp":1720042396409,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"iK_zfqnLfjD7","outputId":"9b3f6db3-01bf-4246-b943-359620c717a2"},"outputs":[{"name":"stdout","output_type":"stream","text":["Before Vectorize: doesn't know how to spell conked \n"]}],"source":["print('Before Vectorize: ', 
x_train_text[3])"]},{"cell_type":"code","execution_count":13,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1720042396409,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"wKxY8e38fjD8","outputId":"19530135-070d-4259-d6a9-7ba06b519763"},"outputs":[{"name":"stdout","output_type":"stream","text":["After Vectorize: \n"," (0, 528584)\t1\n"," (0, 165468)\t1\n"," (0, 300381)\t1\n"," (0, 242211)\t1\n"," (0, 489893)\t1\n"," (0, 134160)\t1\n"]}],"source":["# Notice that the matriz is sparse\n","print('After Vectorize: ')\n","print(x_train_cv[3])"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"QTPPEMd9fjD8"},"source":["Now we can train our model. You can check the documentation of this logistic regressor [here](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic#sklearn.linear_model.LogisticRegression)."]},{"cell_type":"code","execution_count":14,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":74},"execution":{},"executionInfo":{"elapsed":127277,"status":"ok","timestamp":1720042523682,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"2vEPOQS6fjD8","outputId":"3be77fc0-76e6-40b8-8847-5f6e7c6c0ce0"},"outputs":[{"data":{"text/html":["
LogisticRegression(solver='saga')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"],"text/plain":["LogisticRegression(solver='saga')"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["model = LogisticRegression(solver='saga')\n","model.fit(x_train_cv, y_train)"]},{"cell_type":"code","execution_count":15,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":7,"status":"ok","timestamp":1720042523683,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"37bUbqB6fjD8","outputId":"7eb9178d-6130-47d0-bdf4-ce4be164bc97"},"outputs":[{"name":"stdout","output_type":"stream","text":[" precision recall f1-score support\n","\n"," 0 0.81 0.79 0.80 160000\n"," 1 0.79 0.81 0.80 160000\n","\n"," accuracy 0.80 320000\n"," macro avg 0.80 0.80 0.80 320000\n","weighted avg 0.80 0.80 0.80 320000\n","\n"]}],"source":["y_pred = model.predict(x_test_cv)\n","\n","print(classification_report(y_test, y_pred))"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"161kDLhofjD8"},"source":["## Explainable AI\n","The best thing about logistic regresion is that it is simple, and we can get some explanations."]},{"cell_type":"code","execution_count":16,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":1105,"status":"ok","timestamp":1720042524784,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"EILTmxzifjD9","outputId":"b7ce6853-7385-4a24-d4eb-e6d0843ca5d5"},"outputs":[{"name":"stdout","output_type":"stream","text":["(1, 589260)\n","589260\n"]}],"source":["print(model.coef_.shape)\n","print(len(vectorizer.vocabulary_))\n","\n","words_sk = list(vectorizer.vocabulary_.keys())\n","words_sk.sort(key=lambda w: model.coef_[0, vectorizer.vocabulary_[w]])"]},{"cell_type":"code","execution_count":17,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":12,"status":"ok","timestamp":1720042524784,"user":{"displayName":"Dalia 
Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"NGjVPON6fjD9","outputId":"d40443bc-476d-4f5a-ce90-4b5b17e47933"},"outputs":[{"name":"stdout","output_type":"stream","text":["roni: -3.8625952420933984\n","inaperfectworld: -3.5734321547933936\n","dontyouhate: -3.5002133484207576\n","xbllygbsn: -3.4126303898325787\n","anqju: -3.3363997631497493\n","sad: -3.200516823534637\n","pakcricket: -3.1949062976331675\n","condolences: -3.132503698316079\n","heartbreaking: -3.0665219866881297\n","saddest: -3.042020604188048\n","sadd: -3.029036146667248\n","heartbroken: -3.0287524416643463\n","boohoo: -3.0226033087262802\n","sadface: -2.991829110065316\n","rachelle_lefevr: -2.925076661509848\n","disappointing: -2.902522686643491\n","lvbu: -2.8947109582208865\n","saddens: -2.8855187276040715\n","bummed: -2.836500453805889\n","neda: -2.792917726280752\n"]}],"source":["for w in words_sk[:20]:\n"," print('{}: {}'.format(w, model.coef_[0, vectorizer.vocabulary_[w]]))"]},{"cell_type":"code","execution_count":18,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":10,"status":"ok","timestamp":1720042524784,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"oxQ_jlNRfjD9","outputId":"363de58b-817a-4205-f019-2379d0d64e0d"},"outputs":[{"name":"stdout","output_type":"stream","text":["iamsoannoyed: 2.8493838469077013\n","myfax: 2.7974330510971424\n","jennamadison: 2.5667217237933104\n","yeyy: 2.4780234846131646\n","tryout: 2.438315611477797\n","goldymom: 2.4374072779309204\n","wooohooo: 2.402957513257194\n","thesupergirl: 2.356525094856456\n","iammaxathotspot: 2.3116551216589682\n","londicreations: 2.3074264075299316\n","smilin: 2.2991796213822497\n","worries: 2.2899555142510084\n","sinfulsignorita: 2.27989578448778\n","finchensnail: 2.2642827277181063\n","smackthis: 2.237672991997692\n","kv: 2.2157591386122775\n","tojosan: 2.2117938132889696\n","russmarshalek: 2.20953890861265\n","traciknoppe: 
2.1768232307222153\n","congratulations: 2.1715901103136876\n"]}],"source":["for w in reversed(words_sk[-20:]):\n"," print('{}: {}'.format(w, model.coef_[0, vectorizer.vocabulary_[w]]))"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"9KSSAC3qfjD9"},"source":["What does this mean?\n","\n","Remember the `model.coef_` is the $W$ in:\n","\n","$$h(x)=\\sigma(WX + b)$$\n","\n","where the label 1 is a positive tweet and the label 0 is a negative tweet."]},{"cell_type":"markdown","metadata":{"execution":{},"id":"oDHjTP2_fjD9"},"source":["## Recurrent Neural Network with Pytorch"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"TbgpKy95fjD9"},"source":["In the previous section we used a Bag-Of-Words approach to represent each of the tweets. That means that we only considered whether each of the words appears in each of the tweets (a binary indicator per word), we didn't take into account the order of the words. But we know that the word order is very important and carries relevant information.\n","\n","In this section we will solve the same task, but this time we will implement a Recurrent Neural Network (RNN) instead of using a simple Logistic Regression. Unlike feedforward neural networks, RNNs have cyclic connections making them powerful for modeling sequences.\n","\n","Let's start by importing the relevant libraries.\n"]},{"cell_type":"code","execution_count":19,"metadata":{"execution":{},"executionInfo":{"elapsed":8,"status":"ok","timestamp":1720042524784,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"7nmUJV99fjEB"},"outputs":[],"source":["def set_device():\n"," device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n"," if device != \"cuda\":\n"," print(\"WARNING: For this notebook to perform best, \"\n"," \"if possible, in the menu under `Runtime` -> \"\n"," \"`Change runtime type.` select `GPU` \")\n"," else:\n"," print(\"GPU is enabled in this notebook.\")\n","\n"," return 
device"]},{"cell_type":"code","execution_count":20,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":7,"status":"ok","timestamp":1720042524784,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"chI-18LcfjEB","outputId":"7f633079-6548-48f3-802e-94bc9cfada93"},"outputs":[{"name":"stdout","output_type":"stream","text":["GPU is enabled in this notebook.\n"]}],"source":["# Set the device (check if gpu is available)\n","device = set_device()"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"01UtIN7ofjEC"},"source":["First we will create a Dictionary (`word_to_idx`). This dictionary will map each Token (usually words) to an index (an integer number). We want to limit our dictionary to a certain number of tokens (`num_words_dict`), so we will include in our ditionary those with more occurrences."]},{"cell_type":"code","execution_count":21,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1720042524784,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"afus9SyUfjEC","outputId":"bb4eb869-e2f0-4ccd-f64c-e55908272345"},"outputs":[{"data":{"text/plain":["['.', 'i', '!', \"'\", 'to', 'the', ',', 'a', 'my', 'it']"]},"execution_count":21,"metadata":{},"output_type":"execute_result"}],"source":["# From previous section, we have a list with the most used tokens\n","sorted_words[:10]"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"6vfQFjaufjEC"},"source":["Let's select only the most used."]},{"cell_type":"code","execution_count":22,"metadata":{"execution":{},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1720042524785,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"tGLkxaGcfjEC"},"outputs":[],"source":["num_words_dict = 30000\n","# We reserve two numbers for special tokens.\n","most_used_words = 
sorted_words[:num_words_dict-2]"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"AzhQvekCfjEC"},"source":["We will add two extra Tokens to the dictionary, one for words outside the dictionary (`'UNK'`) and one for padding the sequences (`'PAD'`)."]},{"cell_type":"code","execution_count":23,"metadata":{"execution":{},"executionInfo":{"elapsed":4,"status":"ok","timestamp":1720042524785,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"73Wrb-lEfjEC"},"outputs":[],"source":["# dictionary to go from words to idx\n","word_to_idx = {}\n","# dictionary to go from idx to words (just in case)\n","idx_to_word = {}\n","\n","\n","# We include the special tokens first\n","PAD_token = 0\n","UNK_token = 1\n","\n","word_to_idx['PAD'] = PAD_token\n","word_to_idx['UNK'] = UNK_token\n","\n","idx_to_word[PAD_token] = 'PAD'\n","idx_to_word[UNK_token] = 'UNK'\n","\n","# We popullate our dictionaries with the most used words\n","for num,word in enumerate(most_used_words):\n"," word_to_idx[word] = num + 2\n"," idx_to_word[num+2] = word"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"kMHVkEisfjEC"},"source":["Our goal now is to transform each tweet from a sequence of tokens to a sequence of indexes. 
These sequences of indexes will be the input to our pytorch model."]},{"cell_type":"code","execution_count":24,"metadata":{"execution":{},"executionInfo":{"elapsed":4,"status":"ok","timestamp":1720042524785,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"tkCIu3PKfjED"},"outputs":[],"source":["# A function to convert list of tokens to list of indexes\n","def tokens_to_idx(sentences_tokens,word_to_idx):\n"," sentences_idx = []\n"," for sent in sentences_tokens:\n"," sent_idx = []\n"," for word in sent:\n"," if word in word_to_idx:\n"," sent_idx.append(word_to_idx[word])\n"," else:\n"," sent_idx.append(word_to_idx['UNK'])\n"," sentences_idx.append(sent_idx)\n"," return sentences_idx"]},{"cell_type":"code","execution_count":25,"metadata":{"execution":{},"executionInfo":{"elapsed":9346,"status":"ok","timestamp":1720042534127,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"aHru4vpzfjED"},"outputs":[],"source":["x_train_idx = tokens_to_idx(x_train_token,word_to_idx)\n","x_test_idx = tokens_to_idx(x_test_token,word_to_idx)"]},{"cell_type":"code","execution_count":26,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":8,"status":"ok","timestamp":1720042534127,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"Ofj3OD7zfjED","outputId":"b2788d03-dbfa-41d7-8231-5011206baa59"},"outputs":[{"name":"stdout","output_type":"stream","text":["Before converting: ['worst', 'headache', 'ever']\n","After converting: [721, 458, 237]\n"]}],"source":["some_number = 1\n","print('Before converting: ', x_train_token[some_number])\n","print('After converting: ', x_train_idx[some_number])"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"NcCicvb-fjED"},"source":["We need all the sequences to have the same length. 
To select an adequate sequence length, let's explore some statistics about the length of the tweets:"]},{"cell_type":"code","execution_count":27,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":6,"status":"ok","timestamp":1720042534128,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"BSjhdyYUfjED","outputId":"82e49be9-7868-44ac-b496-c7a48da1efee"},"outputs":[{"name":"stdout","output_type":"stream","text":["Max tweet word length: 229\n","Mean tweet word length: 15.0\n","99% percent under: 37.0\n"]}],"source":["tweet_lens = np.asarray([len(sentence) for sentence in x_train_idx])\n","print('Max tweet word length: ',tweet_lens.max())\n","print('Mean tweet word length: ',np.median(tweet_lens))\n","print('99% percent under: ',np.quantile(tweet_lens,0.99))"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"t311WY6ZfjEE"},"source":["We cut the sequences which are longer than our chosen maximum length (`max_length`) and fill with zeros the ones that are shorter."]},{"cell_type":"code","execution_count":28,"metadata":{"execution":{},"executionInfo":{"elapsed":5,"status":"ok","timestamp":1720042534128,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"r4S8KTWLfjEE"},"outputs":[],"source":[" # We choose the max length\n"," max_length = 40\n","\n","# A function to make all the sequences have the same length\n","# Note that the output is a Numpy matrix\n"," def padding(sentences, seq_len):\n"," features = np.zeros((len(sentences), seq_len),dtype=int)\n"," for ii, tweet in enumerate(sentences):\n"," len_tweet = len(tweet)\n"," if len_tweet != 0:\n"," if len_tweet <= seq_len:\n"," # If it's shorter, we fill with zeros (the padding Token index)\n"," features[ii, -len(tweet):] = np.array(tweet)[:seq_len]\n"," if len_tweet > seq_len:\n"," # If it's longer, we take the last 'seq_len' indexes\n"," features[ii, :] = np.array(tweet)[-seq_len:]\n"," 
return features"]},{"cell_type":"code","execution_count":29,"metadata":{"execution":{},"executionInfo":{"elapsed":4762,"status":"ok","timestamp":1720042538886,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"Z-Cw-bBxfjEE"},"outputs":[],"source":["# We convert our list of tokens into a numpy matrix\n","# where all instances have the same lenght\n","x_train_pad = padding(x_train_idx,max_length)\n","x_test_pad = padding(x_test_idx,max_length)\n","\n","# We convert our target list a numpy matrix\n","y_train_np = np.asarray(y_train)\n","y_test_np = np.asarray(y_test)"]},{"cell_type":"code","execution_count":30,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":12,"status":"ok","timestamp":1720042538886,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"8eC3YswlfjEE","outputId":"3bb0ea7f-518f-4545-9241-feb783f48122"},"outputs":[{"name":"stdout","output_type":"stream","text":["Before padding: [1, 3, 71, 24, 122, 3, 533, 74, 13, 4, 3, 102, 13, 209, 2, 12, 150, 4, 22, 5, 18, 667, 3, 138, 61, 7, 3296, 4]\n","After padding: [ 0 0 0 0 0 0 0 0 0 0 0 0 1 3\n"," 71 24 122 3 533 74 13 4 3 102 13 209 2 12\n"," 150 4 22 5 18 667 3 138 61 7 3296 4]\n"]}],"source":["some_number = 2\n","print('Before padding: ', x_train_idx[some_number])\n","print('After padding: ', x_train_pad[some_number])"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"SzDhnauUfjEE"},"source":["Now, let's convert the data to pytorch format.\n"]},{"cell_type":"code","execution_count":31,"metadata":{"execution":{},"executionInfo":{"elapsed":10,"status":"ok","timestamp":1720042538886,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"--Yd22YWfjEF"},"outputs":[],"source":["# create Tensor datasets\n","train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train_np))\n","valid_data = 
TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test_np))\n","\n","# Batch size (this is an important hyperparameter)\n","batch_size = 100\n","\n","# dataloaders\n","# make sure to SHUFFLE your data\n","train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,drop_last = True)\n","valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size,drop_last = True)"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"jQ5qPOWTfjEF"},"source":["Each batch of data in our traning proccess will have the folllowing format:"]},{"cell_type":"code","execution_count":33,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":598,"status":"ok","timestamp":1720042563992,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"S1mqhk1hfjEF","outputId":"c97e7edd-695f-4336-a2e6-f6bed4852a63"},"outputs":[{"name":"stdout","output_type":"stream","text":["Sample input size: torch.Size([100, 40])\n","Sample input: \n"," tensor([[ 0, 0, 0, ..., 32, 203, 86],\n"," [ 0, 0, 0, ..., 1, 1, 4661],\n"," [ 0, 0, 0, ..., 169, 43, 34],\n"," ...,\n"," [ 0, 0, 0, ..., 2, 2961, 4076],\n"," [ 0, 0, 0, ..., 2319, 1325, 2],\n"," [ 0, 0, 0, ..., 7, 253, 1]])\n","Sample input: \n"," tensor([0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,\n"," 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0,\n"," 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,\n"," 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,\n"," 0, 1, 0, 1])\n"]}],"source":["# Obtain one batch of training data\n","dataiter = iter(train_loader)\n","sample_x, sample_y = dataiter.__next__()\n","\n","print('Sample input size: ', sample_x.size()) # batch_size, seq_length\n","print('Sample input: \\n', sample_x)\n","print('Sample input: \\n', sample_y)"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"jn0PzZdGfjEF"},"source":["Now, 
we will define the `SentimentRNN` class. Most of the model's class will be familiar to you, but there are two important layers we would like you to pay attention to:\n","\n","* Embedding Layer\n","> This layer is like a linear layer, but it makes it possible to use a sequence of indexes as inputs (instead of a sequence of one-hot-encoded vectors). During training, the Embedding layer learns a linear transformation from the space of words (a vector space of dimension `num_words_dict`) into a new, smaller, vector space of dimension `embedding_dim`. We suggest you read this [thread](https://discuss.pytorch.org/t/how-does-nn-embedding-work/88518/3) and the [pytorch documentation](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) if you want to learn more about this particular kind of layer.\n","\n","\n","* LSTM layer\n","> This is one of the most used classes of Recurrent Neural Networks. In Pytorch we can add several stacked layers in just one line of code. In our case, the number of layers added is decided with the parameter `no_layers`. 
If you want to learn more about LSTMs we strongly recommend you this [Colahs thread](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) about them.\n","\n","\n","\n","\n","\n"]},{"cell_type":"code","execution_count":34,"metadata":{"execution":{},"executionInfo":{"elapsed":433,"status":"ok","timestamp":1720042567199,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"vfzcowAxfjEF"},"outputs":[],"source":["class SentimentRNN(nn.Module):\n"," def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.1):\n"," super(SentimentRNN,self).__init__()\n","\n"," self.output_dim = output_dim\n"," self.hidden_dim = hidden_dim\n"," self.no_layers = no_layers\n"," self.vocab_size = vocab_size\n"," self.drop_prob = drop_prob\n","\n"," # Embedding Layer\n"," self.embedding = nn.Embedding(vocab_size, embedding_dim)\n","\n"," # LSTM Layers\n"," self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,\n"," num_layers=no_layers, batch_first=True,\n"," dropout=self.drop_prob)\n","\n"," # Dropout layer\n"," self.dropout = nn.Dropout(drop_prob)\n","\n"," # Linear and Sigmoid layer\n"," self.fc = nn.Linear(self.hidden_dim, output_dim)\n"," self.sig = nn.Sigmoid()\n","\n"," def forward(self,x,hidden):\n"," batch_size = x.size(0)\n","\n"," # Embedding out\n"," embeds = self.embedding(x)\n"," #Shape: [batch_size x max_length x embedding_dim]\n","\n"," # LSTM out\n"," lstm_out, hidden = self.lstm(embeds, hidden)\n"," # Shape: [batch_size x max_length x hidden_dim]\n","\n"," # Select the activation of the last Hidden Layer\n"," lstm_out = lstm_out[:,-1,:].contiguous()\n"," # Shape: [batch_size x hidden_dim]\n","\n"," ## You can instead average the activations across all the times\n"," # lstm_out = torch.mean(lstm_out, 1).contiguous()\n","\n"," # Dropout and Fully connected layer\n"," out = self.dropout(lstm_out)\n"," out = self.fc(out)\n","\n"," # Sigmoid function\n"," sig_out = self.sig(out)\n","\n"," # return 
last sigmoid output and hidden state\n"," return sig_out, hidden\n","\n"," def init_hidden(self, batch_size):\n"," ''' Initializes hidden state '''\n"," # Create two new tensors with sizes n_layers x batch_size x hidden_dim,\n"," # initialized to zero, for hidden state and cell state of LSTM\n"," h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)\n"," c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)\n"," hidden = (h0,c0)\n"," return hidden"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"YfrLPa9mfjEF"},"source":["We choose the parameters of the model."]},{"cell_type":"code","execution_count":35,"metadata":{"execution":{},"executionInfo":{"elapsed":471,"status":"ok","timestamp":1720042569608,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"rOm-xoFkfjEG"},"outputs":[],"source":["# Parameters of our network\n","\n","# Size of our vocabulary\n","vocab_size = num_words_dict\n","\n","# Embedding dimension\n","embedding_dim = 32\n","\n","# Number of stacked LSTM layers\n","no_layers = 2\n","\n","# Dimension of the hidden layer in LSTMs\n","hidden_dim = 64\n","\n","# Output dimension (a single sigmoid unit for binary classification)\n","output_dim = 1\n","\n","# Dropout parameter for regularization\n","drop_prob = 0.25"]},{"cell_type":"code","execution_count":36,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":465,"status":"ok","timestamp":1720042571776,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"xapxpe7ufjEG","outputId":"51c90159-7d2b-4fc4-f34a-98e9901d40e4"},"outputs":[{"name":"stdout","output_type":"stream","text":["SentimentRNN(\n"," (embedding): Embedding(30000, 32)\n"," (lstm): LSTM(32, 64, num_layers=2, batch_first=True, dropout=0.25)\n"," (dropout): Dropout(p=0.25, inplace=False)\n"," (fc): Linear(in_features=64, out_features=1, bias=True)\n"," (sig): Sigmoid()\n",")\n"]}],"source":["# Let's define our 
model\n","model = SentimentRNN(no_layers, vocab_size, hidden_dim,\n"," embedding_dim, drop_prob=drop_prob)\n","# Moving to gpu\n","model.to(device)\n","print(model)"]},{"cell_type":"code","execution_count":37,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1720042571776,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"GEvTs3uwfjEG","outputId":"2e15f6df-2aa8-4665-b2da-7363d2bfa09e"},"outputs":[{"name":"stdout","output_type":"stream","text":["Total Number of parameters: 1018433\n"]}],"source":["# How many trainable parameters does our model have?\n","model_parameters = filter(lambda p: p.requires_grad, model.parameters())\n","params = sum([np.prod(p.size()) for p in model_parameters])\n","print('Total Number of parameters: ',params)"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"Pc2OC5GDfjEG"},"source":["We choose the losses and the optimizer for the training procces."]},{"cell_type":"code","execution_count":38,"metadata":{"execution":{},"executionInfo":{"elapsed":1740,"status":"ok","timestamp":1720042574210,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"iBWjPADUfjEG"},"outputs":[],"source":["# loss and optimization functions\n","lr = 0.001\n","\n","# Binary crossentropy is a good loss function for a binary classification problem\n","criterion = nn.BCELoss()\n","\n","# We choose an Adam optimizer\n","optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n","\n","# function to predict accuracy\n","def acc(pred,label):\n"," pred = torch.round(pred.squeeze())\n"," return torch.sum(pred == label.squeeze()).item()"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"OZgMwOe2fjEG"},"source":["We are ready to train our 
model."]},{"cell_type":"code","execution_count":39,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"execution":{},"executionInfo":{"elapsed":304614,"status":"ok","timestamp":1720042880244,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"3B6YhEocfjEH","outputId":"76276a1f-7775-4b98-aab0-0e199aa133e4"},"outputs":[{"name":"stdout","output_type":"stream","text":["Epoch 1\n","train_loss : 0.4366412344621494 val_loss : 0.3881208170717582\n","train_accuracy : 79.485546875 val_accuracy : 82.475\n","Validation loss decreased (inf --> 0.388121). Saving model ...\n","==================================================\n","Epoch 2\n","train_loss : 0.3760281792609021 val_loss : 0.3713956154882908\n","train_accuracy : 83.186328125 val_accuracy : 83.4575\n","Validation loss decreased (0.388121 --> 0.371396). Saving model ...\n","==================================================\n","Epoch 3\n","train_loss : 0.3574051411205437 val_loss : 0.36425656544510276\n","train_accuracy : 84.19953125 val_accuracy : 83.80375\n","Validation loss decreased (0.371396 --> 0.364257). Saving model ...\n","==================================================\n","Epoch 4\n","train_loss : 0.344456663565943 val_loss : 0.3613302929420024\n","train_accuracy : 84.89265625 val_accuracy : 84.00874999999999\n","Validation loss decreased (0.364257 --> 0.361330). Saving model ...\n","==================================================\n","Epoch 5\n","train_loss : 0.33407817618339325 val_loss : 0.3601334386831149\n","train_accuracy : 85.444921875 val_accuracy : 84.03625\n","Validation loss decreased (0.361330 --> 0.360133). 
Saving model ...\n","==================================================\n"]}],"source":["# Number of training Epochs\n","epochs = 5\n","\n","# Maximum absolute value accepted for the gradeint\n","clip = 5\n","\n","# Initial Loss value (assumed big)\n","valid_loss_min = np.Inf\n","\n","# Lists to follow the evolution of the loss and accuracy\n","epoch_tr_loss,epoch_vl_loss = [],[]\n","epoch_tr_acc,epoch_vl_acc = [],[]\n","\n","# Train for a number of Epochs\n","for epoch in range(epochs):\n"," train_losses = []\n"," train_acc = 0.0\n"," model.train()\n","\n"," for inputs, labels in train_loader:\n","\n"," # Initialize hidden state\n"," h = model.init_hidden(batch_size)\n"," # Creating new variables for the hidden state\n"," h = tuple([each.data.to(device) for each in h])\n","\n"," # Move batch inputs and labels to gpu\n"," inputs, labels = inputs.to(device), labels.to(device)\n","\n"," # Set gradient to zero\n"," model.zero_grad()\n","\n"," # Compute model output\n"," output,h = model(inputs,h)\n","\n"," # Calculate the loss and perform backprop\n"," loss = criterion(output.squeeze(), labels.float())\n"," loss.backward()\n"," train_losses.append(loss.item())\n","\n"," # calculating accuracy\n"," accuracy = acc(output,labels)\n"," train_acc += accuracy\n","\n"," #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.\n"," nn.utils.clip_grad_norm_(model.parameters(), clip)\n"," optimizer.step()\n","\n","\n"," # Evaluate on the validation set for this epoch\n"," val_losses = []\n"," val_acc = 0.0\n"," model.eval()\n"," for inputs, labels in valid_loader:\n","\n"," # Initialize hidden state\n"," val_h = model.init_hidden(batch_size)\n"," val_h = tuple([each.data.to(device) for each in val_h])\n","\n"," # Move batch inputs and labels to gpu\n"," inputs, labels = inputs.to(device), labels.to(device)\n","\n"," # Compute model output\n"," output, val_h = model(inputs, val_h)\n","\n"," # Compute Loss\n"," val_loss = criterion(output.squeeze(), 
labels.float())\n","\n"," val_losses.append(val_loss.item())\n","\n"," accuracy = acc(output,labels)\n"," val_acc += accuracy\n","\n"," epoch_train_loss = np.mean(train_losses)\n"," epoch_val_loss = np.mean(val_losses)\n"," epoch_train_acc = train_acc/len(train_loader.dataset)\n"," epoch_val_acc = val_acc/len(valid_loader.dataset)\n"," epoch_tr_loss.append(epoch_train_loss)\n"," epoch_vl_loss.append(epoch_val_loss)\n"," epoch_tr_acc.append(epoch_train_acc)\n"," epoch_vl_acc.append(epoch_val_acc)\n"," print(f'Epoch {epoch+1}')\n"," print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')\n"," print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')\n"," if epoch_val_loss <= valid_loss_min:\n"," print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min,epoch_val_loss))\n"," # torch.save(model.state_dict(), '../working/state_dict.pt')\n"," valid_loss_min = epoch_val_loss\n"," print(25*'==')"]},{"cell_type":"code","execution_count":40,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":364},"execution":{},"executionInfo":{"elapsed":894,"status":"ok","timestamp":1720042881135,"user":{"displayName":"Dalia Nasr","userId":"11103095680145801589"},"user_tz":-180},"id":"ttJazP-nfjEH","outputId":"992bed02-611e-4614-c60f-77223d5b801a"},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["fig = plt.figure(figsize = (20, 6))\n","plt.subplot(1, 2, 1)\n","plt.plot(epoch_tr_acc, label='Train Acc')\n","plt.plot(epoch_vl_acc, label='Validation Acc')\n","plt.title(\"Accuracy\")\n","plt.legend()\n","plt.grid()\n","\n","plt.subplot(1, 2, 2)\n","plt.plot(epoch_tr_loss, label='Train loss')\n","plt.plot(epoch_vl_loss, label='Validation loss')\n","plt.title(\"Loss\")\n","plt.legend()\n","plt.grid()\n","\n","plt.show()"]},{"cell_type":"markdown","metadata":{"execution":{},"id":"iUyaF-EbfjEH"},"source":["---\n","# What's Next?\n","\n","You can use this project template as a starting point to think about your own project. There are a lot of ways to continue, here we share with you some ideas you might find useful:\n","\n","* **Work on the Preprocessing.** We used a very rudimentary way to tokenize tweets. But there are better ways to preprocess the data. Can you think of a suitable way to preprocess the data for this particular task? How does the performance of the model change when the data is processed correctly?\n","* **Work on the Model.** The RNN model proposed in this notebook is not optimized at all. You can work on finding a better architecture or better hyperparameters. Maybe using bidirectional LSTMs or increasing the number of stacked layers can improve the performance, feel free to try different approaches.\n","* **Work on the Embedding.** Our model learnt an embedding during the training on this Twitter corpus for a particular task. You can explore the representation of different words in this learned embedding. Also, you can try using different word embeddings. You can train them on this corpus or you can use an embedding trained on another corpus of data. How does the change of the embedding affect the model performance?\n","* **Try sentiment analysis on another dataset.** There are lots of available datasets to work with, we can help you find one that is interesting to you. 
Do you belive that a sentiment analysis model trained on some corpus (Twitter dataset) will perform well on another type of data (for example, youtube comments)?\n","\n"]}],"metadata":{"accelerator":"GPU","colab":{"provenance":[],"toc_visible":true},"kernel":{"display_name":"Python 3","language":"python","name":"python3"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.2"},"widgets":{"application/vnd.jupyter.widget-state+json":{"057e918ace004506aedc4e4b9942c3a8":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_9baa1a735c0646b89953bf4a7c7fc92c","placeholder":"​","style":"IPY_MODEL_0ac9711c8ece4c5397a8cd810713adfb","value":"Downloading readme: 
100%"}},"0ac9711c8ece4c5397a8cd810713adfb":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"0bdc146792a64853ae06a9d185aa2b15":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1cef38981af6457dbaeb393f9936a389":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_b0b5cfae51214c60bbca9a09b196c217","IPY_MODEL_5ee2a4b33be04c6db8ee4d7995c2376d","IPY_MODEL_403fffb635c2409ebeabc90063750ed3"],"
layout":"IPY_MODEL_6279343019064572adedf34cfbd437fa"}},"1cf3ba0f756f4aa5ad1dcb675a791cfa":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"2475bd62a3224bacb38a6334d07d6a8c":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"252949e8784c4878a62eb2e30b1e3466":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"2715d00db77545f9aa5eace8a0eb2839":{"model_module":"@jupyter-widgets/base","model_module_
version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2c42e2fef6314c9e842a7e9641af3cab":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"2ead0216695e
4227aef44552f4ec3cc9":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_a9a0f6ce71ed415c8c8513f68e34e162","max":1600000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_7f638a6deacd42e88c031fa47797516b","value":1600000}},"325387f6b62d47b0b21bea61676cea72":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_a8d69769921241b8b1081e84f7770858","max":6837,"min":0,"orientation":"horizontal","style":"IPY_MODEL_d189f24b0e964d1a9fc86379bad38cca","value":6837}},"3610a2db297f4686bf9043f2b7ee55b5":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"39fa73efcbf54d8dad225d8380061dbf":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widge
ts/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_0bdc146792a64853ae06a9d185aa2b15","placeholder":"​","style":"IPY_MODEL_768da964ffcd44fea1af09e81f5621f3","value":" 320000/320000 [00:06<00:00, 58691.43it/s]"}},"3d29947b5d2d4e2abc1355d900096642":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"3f7a8f56f15c434da70029366a37167a":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"40262cb3eefa45fcbe37aaafccb69f5f":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,
"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"403fffb635c2409ebeabc90063750ed3":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2c42e2fef6314c9e842a7e9641af3cab","placeholder":"​","style":"IPY_MODEL_913d95e58aa94e4a8009768a23fbf304","value":" 498/498 [00:00<00:00, 7393.07 examples/s]"}},"48b812211db04284bfbbf02823fb879a":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_a1bd0616199e44538977ee2ea6049690","placeholder":"​","style":"IPY_MODEL_835fb9a91b34471fa6d61adf37616f52","value":" 1280000/1280000 [00:22<00:00, 
77416.28it/s]"}},"49c5a3fbe87b491cb3f0f450a0af0659":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"510eeffb32694e7798f23e3931d7a943":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_49c5a3fbe87b491cb3f0f450a0af0659","placeholder":"​","style":"IPY_MODEL_252949e8784c4878a62eb2e30b1e3466","value":"Downloading data: 
100%"}},"53843f49adda4bce8450fd91fa9fd587":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_849e39cc86f64e558ff94bf542a5121a","placeholder":"​","style":"IPY_MODEL_67b0b03c391c414bad5ea9fb3c947a2f","value":" 1600000/1600000 [01:18<00:00, 14710.70 examples/s]"}},"5455119809c74916acc50e1905903ded":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"55ba92cfe0724286ac1c2bbe6577e5c8":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_na
me":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5ee2a4b33be04c6db8ee4d7995c2376d":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_f04df4daeb6049ab85d3d75b472ccf6e","max":498,"min":0,"orientation":"horizontal","style":"IPY_MODEL_fd0b3c53b66543cea0c396d8047445a8","value":498}},"5fa7ab2ab2004e5cb692199e2bd27d6b":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_c
olumns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6279343019064572adedf34cfbd437fa":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"67a4fa49ca5349d58512a16a3742d401":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"67b0b03c391c414bad5ea9fb3c947a2f":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-
widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"6b6cc35257fe433e93736d02e898b6b8":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6b7286d74e0f4a0199dbfcaf3dd0d622":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"6d64402d9da74516ab4e1d46ae9f1ee3":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-
widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_5fa7ab2ab2004e5cb692199e2bd27d6b","max":4033,"min":0,"orientation":"horizontal","style":"IPY_MODEL_ab71bd2b452146829e973d6cf99f31ed","value":4033}},"6e6c5372ffe045c0b72587989567429e":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_b54b826314ea4b3a92eebd218c093fc1","placeholder":"​","style":"IPY_MODEL_8cd7be688b8c4818be48915db14a0792","value":"Generating train split: 100%"}},"768da964ffcd44fea1af09e81f5621f3":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"78d48ee2fb9f42089f475fcf5fc368c8":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e0fc900d8b5940a6bd6a97e58adb4651","placeholder":"​","style":"IPY_MODEL_6b7286d74e0f4a0199dbfcaf3dd0d622","value":"100%"}},"7bcef602e7f441308472bc145b12dcd3":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"La
youtModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"7f638a6deacd42e88c031fa47797516b":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"835fb9a91b34471fa6d61adf37616f52":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"84485541f3a14c65a67d10a97b72bbad":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"Descript
ionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"849e39cc86f64e558ff94bf542a5121a":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"8cd7be688b8c4818be48915db14a0792":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"90908b6f69524a72860214ef8bd2d946":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"a
lign_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"913d95e58aa94e4a8009768a23fbf304":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"942ce490d87347c789e229589b1b9c9f":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"96c2d7ee644a438982e1792b7ec0453c":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flo
w":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"97fb30a5a31742efa1d188b9361f9938":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"9a7140a6197945d5bac5c48b820dfb04":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"9b34daddb9cc48bba109e547177ec654":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template
_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"9baa1a735c0646b89953bf4a7c7fc92c":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a1bd0616199e44538977ee2ea6049690":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":nu
ll,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a4bbd3df99cd4acab5e1b3ba5cd7c114":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a8b3dfaa2831416582d8eeef01451386":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_7bcef602e7f441308472bc14
5b12dcd3","max":81363704,"min":0,"orientation":"horizontal","style":"IPY_MODEL_97fb30a5a31742efa1d188b9361f9938","value":81363704}},"a8d69769921241b8b1081e84f7770858":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"a9a0f6ce71ed415c8c8513f68e34e162":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_
height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"ab71bd2b452146829e973d6cf99f31ed":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"afd671543846468abfe37669a72845c3":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_057e918ace004506aedc4e4b9942c3a8","IPY_MODEL_325387f6b62d47b0b21bea61676cea72","IPY_MODEL_ea1e3eb0e6ec4f8d82cf9b12cfe6e700"],"layout":"IPY_MODEL_96c2d7ee644a438982e1792b7ec0453c"}},"b0b5cfae51214c60bbca9a09b196c217":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2715d00db77545f9aa5eace8a0eb2839","placeholder":"​","style":"IPY_MODEL_942ce490d87347c789e229589b1b9c9f","value":"Generating test split: 
100%"}},"b0ca3012d0b84c5a9d7c1fc176251af7":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_a4bbd3df99cd4acab5e1b3ba5cd7c114","max":320000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_9a7140a6197945d5bac5c48b820dfb04","value":320000}},"b54b826314ea4b3a92eebd218c093fc1":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c29e06a72ac9401b8c41f4195021071e":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"FloatProgressModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_modul
e":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_3f7a8f56f15c434da70029366a37167a","max":1280000,"min":0,"orientation":"horizontal","style":"IPY_MODEL_3610a2db297f4686bf9043f2b7ee55b5","value":1280000}},"c432c4efcb794ce781fcb6f176f1b60d":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_510eeffb32694e7798f23e3931d7a943","IPY_MODEL_a8b3dfaa2831416582d8eeef01451386","IPY_MODEL_db1cdafaf36f4c339476f3221abc17b3"],"layout":"IPY_MODEL_ffd3778a96e046718828bbc5aa73f173"}},"d189f24b0e964d1a9fc86379bad38cca":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"d8de1a85076b453a92295e79110ba8fd":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_78d48ee2fb9f42089f475fcf5fc368c8","IPY_MODEL_b0ca3012d0b84c5a9d7c1fc176251af7","IPY_MODEL_39fa73efcbf54d8dad225d8380061dbf"],"layout":"IPY_MODEL_6b6cc35257fe433e93736d02e898b6b8"}},"d9ca809f7b1c49e595a05458251f3ab2":{"model_module"
:"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_55ba92cfe0724286ac1c2bbe6577e5c8","placeholder":"​","style":"IPY_MODEL_67a4fa49ca5349d58512a16a3742d401","value":" 4.03k/4.03k [00:00<00:00, 114kB/s]"}},"db1cdafaf36f4c339476f3221abc17b3":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_9b34daddb9cc48bba109e547177ec654","placeholder":"​","style":"IPY_MODEL_fd2b5a6533794a2794579956c25247fb","value":" 81.4M/81.4M [00:06<00:00, 
15.3MB/s]"}},"db432a2cd6244a7592fc9732f0ca4738":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"db9bf44dec914db793cc4f73751c272c":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"
overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"df7eba182d1b4c21bc21d157eac6b996":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_db432a2cd6244a7592fc9732f0ca4738","placeholder":"​","style":"IPY_MODEL_84485541f3a14c65a67d10a97b72bbad","value":"Downloading builder script: 100%"}},"e0fc900d8b5940a6bd6a97e58adb4651":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e1348a02ceeb4af19fbd63d52b7d843b":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":
null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_fbf51b14e6b34d0485ddf59c43d22c49","IPY_MODEL_c29e06a72ac9401b8c41f4195021071e","IPY_MODEL_48b812211db04284bfbbf02823fb879a"],"layout":"IPY_MODEL_5455119809c74916acc50e1905903ded"}},"ea1e3eb0e6ec4f8d82cf9b12cfe6e700":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_db9bf44dec914db793cc4f73751c272c","placeholder":"​","style":"IPY_MODEL_1cf3ba0f756f4aa5ad1dcb675a791cfa","value":" 6.84k/6.84k [00:00<00:00, 157kB/s]"}},"f04df4daeb6049ab85d3d75b472ccf6e":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"f3a9667c8c994324a2409f227bd0a1e9":{"model
_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_6e6c5372ffe045c0b72587989567429e","IPY_MODEL_2ead0216695e4227aef44552f4ec3cc9","IPY_MODEL_53843f49adda4bce8450fd91fa9fd587"],"layout":"IPY_MODEL_40262cb3eefa45fcbe37aaafccb69f5f"}},"fbb4191426bd485e8e965b6d432eecae":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HBoxModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_df7eba182d1b4c21bc21d157eac6b996","IPY_MODEL_6d64402d9da74516ab4e1d46ae9f1ee3","IPY_MODEL_d9ca809f7b1c49e595a05458251f3ab2"],"layout":"IPY_MODEL_90908b6f69524a72860214ef8bd2d946"}},"fbf51b14e6b34d0485ddf59c43d22c49":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"HTMLModel","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2475bd62a3224bacb38a6334d07d6a8c","placeholder":"​","style":"IPY_MODEL_3d29947b5d2d4e2abc1355d900096642","value":"100%"}},"fd0b3c53b66543cea0c396d8047445a8":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"ProgressStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_coun
t":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"fd2b5a6533794a2794579956c25247fb":{"model_module":"@jupyter-widgets/controls","model_module_version":"1.5.0","model_name":"DescriptionStyleModel","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ffd3778a96e046718828bbc5aa73f173":{"model_module":"@jupyter-widgets/base","model_module_version":"1.2.0","model_name":"LayoutModel","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}}}}},"nbformat":4,"nbformat_minor":0} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "view-in-github" + }, + "source": [ + "\"Open   \"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "D_fgc45VfjDz" + }, + "source": [ + "# Twitter Sentiment Analysis\n", + "\n", + "**By Neuromatch Academy**\n", + "\n", + "__Content 
creators:__ Juan Manuel Rodriguez, Salomey Osei, Gonzalo Uribarri\n", + "\n", + "__Production editors:__ Amita Kapoor, Spiros Chavlis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "axvz0SUsfjD0" + }, + "source": [ + "---\n", + "# Welcome to the NLP project template\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "2Vfm0ThbfjD1" + }, + "source": [ + "---\n", + "# Step 1: Questions and goals\n", + "\n", + "* Can we infer emotion from a tweet text?\n", + "* How words are distributed accross the dataset?\n", + "* Are words related to one kind of emotion?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "Vd1qdNW9fjD1" + }, + "source": [ + "---\n", + "# Step 2: Literature review\n", + "\n", + "[Original Dataset Paper](https://cs.stanford.edu/people/alecmgo/papers/TwitterDistantSupervision09.pdf)\n", + "\n", + "[Papers with code](https://paperswithcode.com/dataset/imdb-movie-reviews)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "oOYDQElpfjD2" + }, + "source": [ + "---\n", + "# Step 3: Load and explore the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EZpxSExUfjD2", + "outputId": "19b01445-9b83-4a93-9cc2-7830ab0dcf5b" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.3/21.3 MB\u001b[0m \u001b[31m60.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m547.8/547.8 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.8/40.8 MB\u001b[0m \u001b[31m11.8 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m64.9/64.9 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m20.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 16.1.0 which is incompatible.\n", + "google-colab 1.0.0 requires requests==2.31.0, but you have requests 2.32.3 which is incompatible.\n", + "ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 16.1.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "# @title Install dependencies\n", + "!pip install pandas --quiet\n", + "!pip install torchtext --quiet\n", + "!pip install datasets --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DxqD3Tk5fjD3", + "outputId": "451d68c5-7894-4f93-9f54-bf0b7f482e20" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/torchtext/data/__init__.py:4: UserWarning: \n", + "/!\\ IMPORTANT WARNING ABOUT TORCHTEXT STATUS /!\\ \n", + "Torchtext is deprecated and the last released version will be 0.18 (this one). 
You can silence this warning by calling the following at the beginnign of your scripts: `import torchtext; torchtext.disable_torchtext_deprecation_warning()`\n", + " warnings.warn(torchtext._TORCHTEXT_DEPRECATION_MSG)\n" + ] + } + ], + "source": [ + "# We import some libraries to load the dataset\n", + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from datasets import load_dataset\n", + "\n", + "from collections import Counter\n", + "from tqdm.notebook import tqdm\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "import torch.nn.functional as F\n", + "from torch.utils.data import TensorDataset, DataLoader\n", + "\n", + "import torchtext\n", + "from torchtext.data import get_tokenizer\n", + "\n", + "from sklearn.utils import shuffle\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import CountVectorizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "63Eg1SLbfjD4" + }, + "source": [ + "You can find the dataset we are going to use in [this website](http://help.sentiment140.com/for-students/)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/", + "height": 567, + "referenced_widgets": [ + "fbb4191426bd485e8e965b6d432eecae", + "df7eba182d1b4c21bc21d157eac6b996", + "6d64402d9da74516ab4e1d46ae9f1ee3", + "d9ca809f7b1c49e595a05458251f3ab2", + "90908b6f69524a72860214ef8bd2d946", + "db432a2cd6244a7592fc9732f0ca4738", + "84485541f3a14c65a67d10a97b72bbad", + "5fa7ab2ab2004e5cb692199e2bd27d6b", + "ab71bd2b452146829e973d6cf99f31ed", + "55ba92cfe0724286ac1c2bbe6577e5c8", + "67a4fa49ca5349d58512a16a3742d401", + "afd671543846468abfe37669a72845c3", + "057e918ace004506aedc4e4b9942c3a8", + "325387f6b62d47b0b21bea61676cea72", + "ea1e3eb0e6ec4f8d82cf9b12cfe6e700", + "96c2d7ee644a438982e1792b7ec0453c", + "9baa1a735c0646b89953bf4a7c7fc92c", + "0ac9711c8ece4c5397a8cd810713adfb", + "a8d69769921241b8b1081e84f7770858", + "d189f24b0e964d1a9fc86379bad38cca", + "db9bf44dec914db793cc4f73751c272c", + "1cf3ba0f756f4aa5ad1dcb675a791cfa", + "c432c4efcb794ce781fcb6f176f1b60d", + "510eeffb32694e7798f23e3931d7a943", + "a8b3dfaa2831416582d8eeef01451386", + "db1cdafaf36f4c339476f3221abc17b3", + "ffd3778a96e046718828bbc5aa73f173", + "49c5a3fbe87b491cb3f0f450a0af0659", + "252949e8784c4878a62eb2e30b1e3466", + "7bcef602e7f441308472bc145b12dcd3", + "97fb30a5a31742efa1d188b9361f9938", + "9b34daddb9cc48bba109e547177ec654", + "fd2b5a6533794a2794579956c25247fb", + "f3a9667c8c994324a2409f227bd0a1e9", + "6e6c5372ffe045c0b72587989567429e", + "2ead0216695e4227aef44552f4ec3cc9", + "53843f49adda4bce8450fd91fa9fd587", + "40262cb3eefa45fcbe37aaafccb69f5f", + "b54b826314ea4b3a92eebd218c093fc1", + "8cd7be688b8c4818be48915db14a0792", + "a9a0f6ce71ed415c8c8513f68e34e162", + "7f638a6deacd42e88c031fa47797516b", + "849e39cc86f64e558ff94bf542a5121a", + "67b0b03c391c414bad5ea9fb3c947a2f", + "1cef38981af6457dbaeb393f9936a389", + "b0b5cfae51214c60bbca9a09b196c217", + "5ee2a4b33be04c6db8ee4d7995c2376d", + 
"403fffb635c2409ebeabc90063750ed3", + "6279343019064572adedf34cfbd437fa", + "2715d00db77545f9aa5eace8a0eb2839", + "942ce490d87347c789e229589b1b9c9f", + "f04df4daeb6049ab85d3d75b472ccf6e", + "fd0b3c53b66543cea0c396d8047445a8", + "2c42e2fef6314c9e842a7e9641af3cab", + "913d95e58aa94e4a8009768a23fbf304" + ] + }, + "id": "3HLOsd3rfjD4", + "outputId": "7653fee1-a871-472b-a978-d8ec0250dc84" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading builder script: 0%| | 0.00/4.03k [00:00\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
polarityuserdatequeryusertext
00_TheSpecialOne_Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, t...
10scotthamiltonMon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
20mattycusMon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
30ElleCTFMon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
40KaroliMon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df" + } + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "# We load the dataset\n", + "\n", + "dataset = load_dataset(\"stanfordnlp/sentiment140\", trust_remote_code= True)\n", + "\n", + "train_data = dataset[\"train\"]\n", + "df = pd.DataFrame(train_data)\n", + "df = df.rename(columns={'sentiment': 'polarity'})\n", + "df = df[['polarity', 'user', 'date', 'query', 'user', 'text']]\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "fuKShcfjfjD4" + }, + "source": [ + "For this project we will use only the text and the polarity of the tweet. Notice that polarity is 0 for negative tweets and 4 for positive tweets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "id": "GXHQOn6gfjD5" + }, + "outputs": [], + "source": [ + "X = df.text.values\n", + "\n", + "# Changes values from [0,4] to [0,1]\n", + "y = (df.polarity.values > 1).astype(int)\n", + "\n", + "\n", + "# Split the data into train and test\n", + "x_train_text, x_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "7kr3TO_LfjD5" + }, + "source": [ + "The first thing we have to do before working on the models is to familiarize ourselves with the dataset. This is called Exploratory Data Analysis (EDA)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FsL-xY03fjD5", + "outputId": "655f0ef8-c177-4f42-c024-1d628241401a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1: @paisleypaisley LOL why do i get ideas so far in advance? it's not even june yet! 
we need a third knitter to have our own summer group \n", + "0: worst headache ever \n", + "0: @ewaniesciuszko i am so sad i wont see you! I miss you already. and yeah! that's perfect; i come back the 18th!\n", + "1: doesn't know how to spell conked \n", + "0: "So we stand here now and no one knows us at all I won't get used to this I won't get used to being gone"...I miss home and everyone -a\n" + ] + } + ], + "source": [ + "for s, l in zip(x_train_text[:5], y_train[:5]):\n", + " print('{}: {}'.format(l, s))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "4cPGXSc-fjD5" + }, + "source": [ + "An interesting thing to analyze is the Word Distribution. In order to count the occurrences of each word, we should tokenize the sentences first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "U1OugpZ0fjD5", + "outputId": "9e6cb4e3-8d8c-4db0-c113-bdd4fe87db5f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Before Tokenize: worst headache ever \n", + "After Tokenize: ['worst', 'headache', 'ever']\n" + ] + } + ], + "source": [ + "tokenizer = get_tokenizer(\"basic_english\")\n", + "\n", + "print('Before Tokenize: ', x_train_text[1])\n", + "print('After Tokenize: ', tokenizer(x_train_text[1]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81, + "referenced_widgets": [ + "e1348a02ceeb4af19fbd63d52b7d843b", + "fbf51b14e6b34d0485ddf59c43d22c49", + "c29e06a72ac9401b8c41f4195021071e", + "48b812211db04284bfbbf02823fb879a", + "5455119809c74916acc50e1905903ded", + "2475bd62a3224bacb38a6334d07d6a8c", + "3d29947b5d2d4e2abc1355d900096642", + "3f7a8f56f15c434da70029366a37167a", + "3610a2db297f4686bf9043f2b7ee55b5", + "a1bd0616199e44538977ee2ea6049690", + 
"835fb9a91b34471fa6d61adf37616f52", + "d8de1a85076b453a92295e79110ba8fd", + "78d48ee2fb9f42089f475fcf5fc368c8", + "b0ca3012d0b84c5a9d7c1fc176251af7", + "39fa73efcbf54d8dad225d8380061dbf", + "6b6cc35257fe433e93736d02e898b6b8", + "e0fc900d8b5940a6bd6a97e58adb4651", + "6b7286d74e0f4a0199dbfcaf3dd0d622", + "a4bbd3df99cd4acab5e1b3ba5cd7c114", + "9a7140a6197945d5bac5c48b820dfb04", + "0bdc146792a64853ae06a9d185aa2b15", + "768da964ffcd44fea1af09e81f5621f3" + ] + }, + "id": "7ZggzGCXfjD6", + "outputId": "ae19f8d6-224d-4224-d3a0-d00c659ec9b2" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0/1280000 [00:00" + ], + "image/png": "\n" + }, + "metadata": {} + } + ], + "source": [ + "plt.bar(range(100), [words[w] for w in sorted_words[:100]])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "o9IYA0cZfjD7" + }, + "source": [ + "It is very common to find this kind of distribution when analyzing corpus of text. This is referred to as the [zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "5FQIOqoRfjD7" + }, + "source": [ + "Usually the number of words in the dictionary will be very large.\n", + "\n", + "Here are some thing we can do to reduce that number:\n", + "\n", + "* Remove puntuation.\n", + "* Remove stop-words.\n", + "* Steaming.\n", + "* Remove very uncommon words (the words that appears in fewer than N occations).\n", + "* Nothing: we can use a pretrain model that handles this kind of situations.\n", + "\n", + "\n", + "We used one of the simplest tokenizers availables. This tokenizer does not take into account many quirks of the language. Moreover, diferent languages have different quirks, so there is no \"universal\" tokenizers. There are many libraries that have \"better\" tokenizers:\n", + "\n", + "* [Spacy](https://spacy.io/): it can be accessed using: `get_tokenizer(\"spacy\")`. 
Spacy supports a wide range of languages.\n", + "* [Huggingface](https://huggingface.co/): it has many tokenizers for different languages. [Doc](https://huggingface.co/transformers/main_classes/tokenizer.html)\n", + "* [NLTK](https://www.nltk.org/): it provides several tokenizers. One of them can be accessed using: `get_tokenizer(\"toktok\")`\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "_ul5MgYcfjD7" + }, + "source": [ + "---\n", + "# Step 4: choose toolkit\n", + "\n", + "Our goal is to train a model capable of estimating the sentiment of a tweet (positive or negative) by reading its content. To that end we will try 2 different approaches:\n", + "\n", + "* A logistic regression using sklearn. **NOTE**: it can probably work better than an SVM model.\n", + "* A simple Embedding + RNN." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "GteI1PxTfjD7" + }, + "source": [ + "## Logistic regression\n", + "\n", + "We will represent our sentences using binary vectorization. This means that our data would be represented as a matrix of instances by word with a one if the word is in the instance, and zero otherwise. Sklearn vectorizers can also do things such as stop-word removal and punctuation removal; you can read more about it in [the documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "id": "S_ei2qu8fjD7" + }, + "outputs": [], + "source": [ + "vectorizer = CountVectorizer(binary=True)\n", + "x_train_cv = vectorizer.fit_transform(x_train_text)\n", + "x_test_cv = vectorizer.transform(x_test_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iK_zfqnLfjD7", + "outputId": "9b3f6db3-01bf-4246-b943-359620c717a2" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Before Vectorize: doesn't know how to spell conked \n" + ] + } + ], + "source": [ + "print('Before Vectorize: ', x_train_text[3])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wKxY8e38fjD8", + "outputId": "19530135-070d-4259-d6a9-7ba06b519763" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "After Vectorize: \n", + " (0, 528584)\t1\n", + " (0, 165468)\t1\n", + " (0, 300381)\t1\n", + " (0, 242211)\t1\n", + " (0, 489893)\t1\n", + " (0, 134160)\t1\n" + ] + } + ], + "source": [ + "# Notice that the matriz is sparse\n", + "print('After Vectorize: ')\n", + "print(x_train_cv[3])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "QTPPEMd9fjD8" + }, + "source": [ + "Now we can train our model. You can check the documentation of this logistic regressor [here](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic#sklearn.linear_model.LogisticRegression)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/", + "height": 74 + }, + "id": "2vEPOQS6fjD8", + "outputId": "3be77fc0-76e6-40b8-8847-5f6e7c6c0ce0" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "LogisticRegression(solver='saga')" + ], + "text/html": [ + "
LogisticRegression(solver='saga')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ] + }, + "metadata": {}, + "execution_count": 14 + } + ], + "source": [ + "model = LogisticRegression(solver='saga')\n", + "model.fit(x_train_cv, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "37bUbqB6fjD8", + "outputId": "7eb9178d-6130-47d0-bdf4-ce4be164bc97" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.81 0.79 0.80 160000\n", + " 1 0.79 0.81 0.80 160000\n", + "\n", + " accuracy 0.80 320000\n", + " macro avg 0.80 0.80 0.80 320000\n", + "weighted avg 0.80 0.80 0.80 320000\n", + "\n" + ] + } + ], + "source": [ + "y_pred = model.predict(x_test_cv)\n", + "\n", + "print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "161kDLhofjD8" + }, + "source": [ + "## Explainable AI\n", + "The best thing about logistic regresion is that it is simple, and we can get some explanations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EILTmxzifjD9", + "outputId": "b7ce6853-7385-4a24-d4eb-e6d0843ca5d5" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(1, 589260)\n", + "589260\n" + ] + } + ], + "source": [ + "print(model.coef_.shape)\n", + "print(len(vectorizer.vocabulary_))\n", + "\n", + "words_sk = list(vectorizer.vocabulary_.keys())\n", + "words_sk.sort(key=lambda w: model.coef_[0, vectorizer.vocabulary_[w]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NGjVPON6fjD9", + "outputId": "d40443bc-476d-4f5a-ce90-4b5b17e47933" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "roni: -3.8625952420933984\n", + "inaperfectworld: -3.5734321547933936\n", + "dontyouhate: -3.5002133484207576\n", + "xbllygbsn: -3.4126303898325787\n", + "anqju: -3.3363997631497493\n", + "sad: -3.200516823534637\n", + "pakcricket: -3.1949062976331675\n", + "condolences: -3.132503698316079\n", + "heartbreaking: -3.0665219866881297\n", + "saddest: -3.042020604188048\n", + "sadd: -3.029036146667248\n", + "heartbroken: -3.0287524416643463\n", + "boohoo: -3.0226033087262802\n", + "sadface: -2.991829110065316\n", + "rachelle_lefevr: -2.925076661509848\n", + "disappointing: -2.902522686643491\n", + "lvbu: -2.8947109582208865\n", + "saddens: -2.8855187276040715\n", + "bummed: -2.836500453805889\n", + "neda: -2.792917726280752\n" + ] + } + ], + "source": [ + "for w in words_sk[:20]:\n", + " print('{}: {}'.format(w, model.coef_[0, vectorizer.vocabulary_[w]]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oxQ_jlNRfjD9", + "outputId": 
"363de58b-817a-4205-f019-2379d0d64e0d" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "iamsoannoyed: 2.8493838469077013\n", + "myfax: 2.7974330510971424\n", + "jennamadison: 2.5667217237933104\n", + "yeyy: 2.4780234846131646\n", + "tryout: 2.438315611477797\n", + "goldymom: 2.4374072779309204\n", + "wooohooo: 2.402957513257194\n", + "thesupergirl: 2.356525094856456\n", + "iammaxathotspot: 2.3116551216589682\n", + "londicreations: 2.3074264075299316\n", + "smilin: 2.2991796213822497\n", + "worries: 2.2899555142510084\n", + "sinfulsignorita: 2.27989578448778\n", + "finchensnail: 2.2642827277181063\n", + "smackthis: 2.237672991997692\n", + "kv: 2.2157591386122775\n", + "tojosan: 2.2117938132889696\n", + "russmarshalek: 2.20953890861265\n", + "traciknoppe: 2.1768232307222153\n", + "congratulations: 2.1715901103136876\n" + ] + } + ], + "source": [ + "for w in reversed(words_sk[-20:]):\n", + " print('{}: {}'.format(w, model.coef_[0, vectorizer.vocabulary_[w]]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "9KSSAC3qfjD9" + }, + "source": [ + "What does this mean?\n", + "\n", + "Remember the `model.coef_` is the $W$ in:\n", + "\n", + "$$h(x)=\\sigma(WX + b)$$\n", + "\n", + "where the label 1 is a positive tweet and the label 0 is a negative tweet." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "oDHjTP2_fjD9" + }, + "source": [ + "## Recurrent Neural Network with Pytorch" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "TbgpKy95fjD9" + }, + "source": [ + "In the previous section we use a Bag-Of-Words approach to represent each of the tweets. That meas that we only consider how many times each of the words appear in each of the tweets, we didnt take into account the order of the words. 
But we know that the word order is very important and carries relevant information.\n", + "\n", + "In this section we will solve the same task, but this time we will implement a Recurrent Neural Network (RNN) instead of using a simple Logistic Regression. Unlike feedforward neural networks, RNNs have cyclic connections, making them powerful for modeling sequences.\n", + "\n", + "Let's start by importing the relevant libraries.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "id": "7nmUJV99fjEB" + }, + "outputs": [], + "source": [ + "def set_device():\n", + " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + " if device != \"cuda\":\n", + " print(\"WARNING: For this notebook to perform best, \"\n", + " \"if possible, in the menu under `Runtime` -> \"\n", + " \"`Change runtime type.` select `GPU` \")\n", + " else:\n", + " print(\"GPU is enabled in this notebook.\")\n", + "\n", + " return device" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "chI-18LcfjEB", + "outputId": "7f633079-6548-48f3-802e-94bc9cfada93" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "GPU is enabled in this notebook.\n" + ] + } + ], + "source": [ + "# Set the device (check if gpu is available)\n", + "device = set_device()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "01UtIN7ofjEC" + }, + "source": [ + "First we will create a Dictionary (`word_to_idx`). This dictionary will map each Token (usually words) to an index (an integer number). We want to limit our dictionary to a certain number of tokens (`num_words_dict`), so we will include in our dictionary the tokens with the most occurrences."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "afus9SyUfjEC", + "outputId": "bb4eb869-e2f0-4ccd-f64c-e55908272345" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['.', 'i', '!', \"'\", 'to', 'the', ',', 'a', 'my', 'it']" + ] + }, + "metadata": {}, + "execution_count": 21 + } + ], + "source": [ + "# From previous section, we have a list with the most used tokens\n", + "sorted_words[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "6vfQFjaufjEC" + }, + "source": [ + "Let's select only the most used." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "id": "tGLkxaGcfjEC" + }, + "outputs": [], + "source": [ + "num_words_dict = 30000\n", + "# We reserve two numbers for special tokens.\n", + "most_used_words = sorted_words[:num_words_dict-2]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "AzhQvekCfjEC" + }, + "source": [ + "We will add two extra Tokens to the dictionary, one for words outside the dictionary (`'UNK'`) and one for padding the sequences (`'PAD'`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "id": "73Wrb-lEfjEC" + }, + "outputs": [], + "source": [ + "# dictionary to go from words to idx\n", + "word_to_idx = {}\n", + "# dictionary to go from idx to words (just in case)\n", + "idx_to_word = {}\n", + "\n", + "\n", + "# We include the special tokens first\n", + "PAD_token = 0\n", + "UNK_token = 1\n", + "\n", + "word_to_idx['PAD'] = PAD_token\n", + "word_to_idx['UNK'] = UNK_token\n", + "\n", + "idx_to_word[PAD_token] = 'PAD'\n", + "idx_to_word[UNK_token] = 'UNK'\n", + "\n", + "# We popullate our dictionaries with the most used words\n", + "for num,word in enumerate(most_used_words):\n", + " word_to_idx[word] = num + 2\n", + " idx_to_word[num+2] = word" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "kMHVkEisfjEC" + }, + "source": [ + "Our goal now is to transform each tweet from a sequence of tokens to a sequence of indexes. These sequences of indexes will be the input to our pytorch model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "id": "tkCIu3PKfjED" + }, + "outputs": [], + "source": [ + "# A function to convert list of tokens to list of indexes\n", + "def tokens_to_idx(sentences_tokens,word_to_idx):\n", + " sentences_idx = []\n", + " for sent in sentences_tokens:\n", + " sent_idx = []\n", + " for word in sent:\n", + " if word in word_to_idx:\n", + " sent_idx.append(word_to_idx[word])\n", + " else:\n", + " sent_idx.append(word_to_idx['UNK'])\n", + " sentences_idx.append(sent_idx)\n", + " return sentences_idx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "id": "aHru4vpzfjED" + }, + "outputs": [], + "source": [ + "x_train_idx = tokens_to_idx(x_train_token,word_to_idx)\n", + "x_test_idx = tokens_to_idx(x_test_token,word_to_idx)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ofj3OD7zfjED", + "outputId": "b2788d03-dbfa-41d7-8231-5011206baa59" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Before converting: ['worst', 'headache', 'ever']\n", + "After converting: [721, 458, 237]\n" + ] + } + ], + "source": [ + "some_number = 1\n", + "print('Before converting: ', x_train_token[some_number])\n", + "print('After converting: ', x_train_idx[some_number])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "NcCicvb-fjED" + }, + "source": [ + "We need all the sequences to have the same length. 
To select an adequate sequence length, let's explore some statistics about the length of the tweets:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BSjhdyYUfjED", + "outputId": "82e49be9-7868-44ac-b496-c7a48da1efee" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Max tweet word length: 229\n", + "Mean tweet word length: 15.0\n", + "99% percent under: 37.0\n" + ] + } + ], + "source": [ + "tweet_lens = np.asarray([len(sentence) for sentence in x_train_idx])\n", + "print('Max tweet word length: ',tweet_lens.max())\n", + "print('Mean tweet word length: ',np.median(tweet_lens))\n", + "print('99% percent under: ',np.quantile(tweet_lens,0.99))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "t311WY6ZfjEE" + }, + "source": [ + "We cut the sequences that are longer than our chosen maximum length (`max_length`) and pad with zeros the ones that are shorter."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "id": "r4S8KTWLfjEE" + }, + "outputs": [], + "source": [ + " # We choose the max length\n", + " max_length = 40\n", + "\n", + "# A function to make all the sequence have the same lenght\n", + "# Note that the output is a Numpy matrix\n", + " def padding(sentences, seq_len):\n", + " features = np.zeros((len(sentences), seq_len),dtype=int)\n", + " for ii, tweet in enumerate(sentences):\n", + " len_tweet = len(tweet)\n", + " if len_tweet != 0:\n", + " if len_tweet <= seq_len:\n", + " # If its shorter, we fill with zeros (the padding Token index)\n", + " features[ii, -len(tweet):] = np.array(tweet)[:seq_len]\n", + " if len_tweet > seq_len:\n", + " # If its larger, we take the last 'seq_len' indexes\n", + " features[ii, :] = np.array(tweet)[-seq_len:]\n", + " return features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "id": "Z-Cw-bBxfjEE" + }, + "outputs": [], + "source": [ + "# We convert our list of tokens into a numpy matrix\n", + "# where all instances have the same lenght\n", + "x_train_pad = padding(x_train_idx,max_length)\n", + "x_test_pad = padding(x_test_idx,max_length)\n", + "\n", + "# We convert our target list a numpy matrix\n", + "y_train_np = np.asarray(y_train)\n", + "y_test_np = np.asarray(y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8eC3YswlfjEE", + "outputId": "3bb0ea7f-518f-4545-9241-feb783f48122" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Before padding: [1, 3, 71, 24, 122, 3, 533, 74, 13, 4, 3, 102, 13, 209, 2, 12, 150, 4, 22, 5, 18, 667, 3, 138, 61, 7, 3296, 4]\n", + "After padding: [ 0 0 0 0 0 0 0 0 0 0 0 0 1 3\n", + " 71 24 122 3 533 74 13 4 3 102 13 209 2 12\n", + " 150 4 22 5 18 667 3 138 61 7 3296 4]\n" + ] + } + ], + 
"source": [ + "some_number = 2\n", + "print('Before padding: ', x_train_idx[some_number])\n", + "print('After padding: ', x_train_pad[some_number])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "SzDhnauUfjEE" + }, + "source": [ + "Now, let's convert the data to pytorch format.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "id": "--Yd22YWfjEF" + }, + "outputs": [], + "source": [ + "# create Tensor datasets\n", + "train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train_np))\n", + "valid_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test_np))\n", + "\n", + "# Batch size (this is an important hyperparameter)\n", + "batch_size = 100\n", + "\n", + "# dataloaders\n", + "# make sure to SHUFFLE your data\n", + "train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,drop_last = True)\n", + "valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size,drop_last = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "jQ5qPOWTfjEF" + }, + "source": [ + "Each batch of data in our traning proccess will have the folllowing format:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "S1mqhk1hfjEF", + "outputId": "c97e7edd-695f-4336-a2e6-f6bed4852a63" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Sample input size: torch.Size([100, 40])\n", + "Sample input: \n", + " tensor([[ 0, 0, 0, ..., 32, 203, 86],\n", + " [ 0, 0, 0, ..., 1, 1, 4661],\n", + " [ 0, 0, 0, ..., 169, 43, 34],\n", + " ...,\n", + " [ 0, 0, 0, ..., 2, 2961, 4076],\n", + " [ 0, 0, 0, ..., 2319, 1325, 2],\n", + " [ 0, 0, 0, ..., 7, 253, 1]])\n", + "Sample input: \n", + " tensor([0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,\n", + " 
1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0,\n", + " 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,\n", + " 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,\n", + " 0, 1, 0, 1])\n" + ] + } + ], + "source": [ + "# Obtain one batch of training data\n", + "dataiter = iter(train_loader)\n", + "sample_x, sample_y = dataiter.__next__()\n", + "\n", + "print('Sample input size: ', sample_x.size()) # batch_size, seq_length\n", + "print('Sample input: \\n', sample_x)\n", + "print('Sample input: \\n', sample_y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "jn0PzZdGfjEF" + }, + "source": [ + "Now, we will define the `SentimentRNN` class. Most of the model's class will be familiar to you, but there are two important layers we would like you to pay attention to:\n", + "\n", + "* Embedding Layer\n", + "> This layer is like a linear layer, but it makes it possible to use a sequence of indexes as inputs (instead of a sequence of one-hot-encoded vectors). During training, the Embedding layer learns a linear transformation from the space of words (a vector space of dimension `num_words_dict`) into a new, smaller, vector space of dimension `embedding_dim`. We suggest reading this [thread](https://discuss.pytorch.org/t/how-does-nn-embedding-work/88518/3) and the [pytorch documentation](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) if you want to learn more about this particular kind of layer.\n", + "\n", + "\n", + "* LSTM layer\n", + "> This is one of the most used classes of Recurrent Neural Networks. In Pytorch we can add several stacked layers in just one line of code. In our case, the number of layers added is set by the parameter `no_layers`.
If you want to learn more about LSTMs we strongly recommend you this [Colahs thread](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) about them.\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "id": "vfzcowAxfjEF" + }, + "outputs": [], + "source": [ + "class SentimentRNN(nn.Module):\n", + " def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.1):\n", + " super(SentimentRNN,self).__init__()\n", + "\n", + " self.output_dim = output_dim\n", + " self.hidden_dim = hidden_dim\n", + " self.no_layers = no_layers\n", + " self.vocab_size = vocab_size\n", + " self.drop_prob = drop_prob\n", + "\n", + " # Embedding Layer\n", + " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n", + "\n", + " # LSTM Layers\n", + " self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,\n", + " num_layers=no_layers, batch_first=True,\n", + " dropout=self.drop_prob)\n", + "\n", + " # Dropout layer\n", + " self.dropout = nn.Dropout(drop_prob)\n", + "\n", + " # Linear and Sigmoid layer\n", + " self.fc = nn.Linear(self.hidden_dim, output_dim)\n", + " self.sig = nn.Sigmoid()\n", + "\n", + " def forward(self,x,hidden):\n", + " batch_size = x.size(0)\n", + "\n", + " # Embedding out\n", + " embeds = self.embedding(x)\n", + " #Shape: [batch_size x max_length x embedding_dim]\n", + "\n", + " # LSTM out\n", + " lstm_out, hidden = self.lstm(embeds, hidden)\n", + " # Shape: [batch_size x max_length x hidden_dim]\n", + "\n", + " # Select the activation of the last Hidden Layer\n", + " lstm_out = lstm_out[:,-1,:].contiguous()\n", + " # Shape: [batch_size x hidden_dim]\n", + "\n", + " ## You can instead average the activations across all the times\n", + " # lstm_out = torch.mean(lstm_out, 1).contiguous()\n", + "\n", + " # Dropout and Fully connected layer\n", + " out = self.dropout(lstm_out)\n", + " out = self.fc(out)\n", + "\n", + " # Sigmoid function\n", + 
" sig_out = self.sig(out)\n", + "\n", + " # return last sigmoid output and hidden state\n", + " return sig_out, hidden\n", + "\n", + " def init_hidden(self, batch_size):\n", + " ''' Initializes hidden state '''\n", + " # Create two new tensors with sizes n_layers x batch_size x hidden_dim,\n", + " # initialized to zero, for hidden state and cell state of LSTM\n", + " h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)\n", + " c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)\n", + " hidden = (h0,c0)\n", + " return hidden" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "YfrLPa9mfjEF" + }, + "source": [ + "We choose the parameters of the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "id": "rOm-xoFkfjEG" + }, + "outputs": [], + "source": [ + "# Parameters of our network\n", + "\n", + "# Size of our vocabulary\n", + "vocab_size = num_words_dict\n", + "\n", + "# Embedding dimension\n", + "embedding_dim = 32\n", + "\n", + "# Number of stacked LSTM layers\n", + "no_layers = 2\n", + "\n", + "# Dimension of the hidden layer in LSTMs\n", + "hidden_dim = 64\n", + "\n", + "# Dropout parameter for regularization\n", + "output_dim = 1\n", + "\n", + "# Dropout parameter for regularization\n", + "drop_prob = 0.25" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xapxpe7ufjEG", + "outputId": "51c90159-7d2b-4fc4-f34a-98e9901d40e4" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "SentimentRNN(\n", + " (embedding): Embedding(30000, 32)\n", + " (lstm): LSTM(32, 64, num_layers=2, batch_first=True, dropout=0.25)\n", + " (dropout): Dropout(p=0.25, inplace=False)\n", + " (fc): Linear(in_features=64, out_features=1, bias=True)\n", + " (sig): Sigmoid()\n", + ")\n" + ] + } + ], + "source": [ + "# Let's 
define our model\n", + "model = SentimentRNN(no_layers, vocab_size, hidden_dim,\n", + " embedding_dim, drop_prob=drop_prob)\n", + "# Moving to gpu\n", + "model.to(device)\n", + "print(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GEvTs3uwfjEG", + "outputId": "2e15f6df-2aa8-4665-b2da-7363d2bfa09e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Total Number of parameters: 1018433\n" + ] + } + ], + "source": [ + "# How many trainable parameters does our model have?\n", + "model_parameters = filter(lambda p: p.requires_grad, model.parameters())\n", + "params = sum([np.prod(p.size()) for p in model_parameters])\n", + "print('Total Number of parameters: ',params)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "Pc2OC5GDfjEG" + }, + "source": [ + "We choose the loss function and the optimizer for the training process." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "id": "iBWjPADUfjEG" + }, + "outputs": [], + "source": [ + "# loss and optimization functions\n", + "lr = 0.001\n", + "\n", + "# Binary crossentropy is a good loss function for a binary classification problem\n", + "criterion = nn.BCELoss()\n", + "\n", + "# We choose an Adam optimizer\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", + "\n", + "# function to predict accuracy\n", + "def acc(pred,label):\n", + " pred = torch.round(pred.squeeze())\n", + " return torch.sum(pred == label.squeeze()).item()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "OZgMwOe2fjEG" + }, + "source": [ + "We are ready to train our model."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3B6YhEocfjEH", + "outputId": "76276a1f-7775-4b98-aab0-0e199aa133e4" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 1\n", + "train_loss : 0.4366412344621494 val_loss : 0.3881208170717582\n", + "train_accuracy : 79.485546875 val_accuracy : 82.475\n", + "Validation loss decreased (inf --> 0.388121). Saving model ...\n", + "==================================================\n", + "Epoch 2\n", + "train_loss : 0.3760281792609021 val_loss : 0.3713956154882908\n", + "train_accuracy : 83.186328125 val_accuracy : 83.4575\n", + "Validation loss decreased (0.388121 --> 0.371396). Saving model ...\n", + "==================================================\n", + "Epoch 3\n", + "train_loss : 0.3574051411205437 val_loss : 0.36425656544510276\n", + "train_accuracy : 84.19953125 val_accuracy : 83.80375\n", + "Validation loss decreased (0.371396 --> 0.364257). Saving model ...\n", + "==================================================\n", + "Epoch 4\n", + "train_loss : 0.344456663565943 val_loss : 0.3613302929420024\n", + "train_accuracy : 84.89265625 val_accuracy : 84.00874999999999\n", + "Validation loss decreased (0.364257 --> 0.361330). Saving model ...\n", + "==================================================\n", + "Epoch 5\n", + "train_loss : 0.33407817618339325 val_loss : 0.3601334386831149\n", + "train_accuracy : 85.444921875 val_accuracy : 84.03625\n", + "Validation loss decreased (0.361330 --> 0.360133). 
Saving model ...\n", + "==================================================\n" + ] + } + ], + "source": [ + "# Number of training Epochs\n", + "epochs = 5\n", + "\n", + "# Maximum norm accepted for the gradient (gradient clipping threshold)\n", + "clip = 5\n", + "\n", + "# Initial Loss value (assumed big)\n", + "valid_loss_min = np.Inf\n", + "\n", + "# Lists to follow the evolution of the loss and accuracy\n", + "epoch_tr_loss,epoch_vl_loss = [],[]\n", + "epoch_tr_acc,epoch_vl_acc = [],[]\n", + "\n", + "# Train for a number of Epochs\n", + "for epoch in range(epochs):\n", + " train_losses = []\n", + " train_acc = 0.0\n", + " model.train()\n", + "\n", + " for inputs, labels in train_loader:\n", + "\n", + " # Initialize hidden state\n", + " h = model.init_hidden(batch_size)\n", + " # Creating new variables for the hidden state\n", + " h = tuple([each.data.to(device) for each in h])\n", + "\n", + " # Move batch inputs and labels to gpu\n", + " inputs, labels = inputs.to(device), labels.to(device)\n", + "\n", + " # Set gradient to zero\n", + " model.zero_grad()\n", + "\n", + " # Compute model output\n", + " output,h = model(inputs,h)\n", + "\n", + " # Calculate the loss and perform backprop\n", + " loss = criterion(output.squeeze(), labels.float())\n", + " loss.backward()\n", + " train_losses.append(loss.item())\n", + "\n", + " # calculating accuracy\n", + " accuracy = acc(output,labels)\n", + " train_acc += accuracy\n", + "\n", + " #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.\n", + " nn.utils.clip_grad_norm_(model.parameters(), clip)\n", + " optimizer.step()\n", + "\n", + "\n", + " # Evaluate on the validation set for this epoch\n", + " val_losses = []\n", + " val_acc = 0.0\n", + " model.eval()\n", + " for inputs, labels in valid_loader:\n", + "\n", + " # Initialize hidden state\n", + " val_h = model.init_hidden(batch_size)\n", + " val_h = tuple([each.data.to(device) for each in val_h])\n", + "\n", + " # Move batch inputs and labels to gpu\n", + "
inputs, labels = inputs.to(device), labels.to(device)\n", + "\n", + " # Compute model output\n", + " output, val_h = model(inputs, val_h)\n", + "\n", + " # Compute Loss\n", + " val_loss = criterion(output.squeeze(), labels.float())\n", + "\n", + " val_losses.append(val_loss.item())\n", + "\n", + " accuracy = acc(output,labels)\n", + " val_acc += accuracy\n", + "\n", + " epoch_train_loss = np.mean(train_losses)\n", + " epoch_val_loss = np.mean(val_losses)\n", + " epoch_train_acc = train_acc/len(train_loader.dataset)\n", + " epoch_val_acc = val_acc/len(valid_loader.dataset)\n", + " epoch_tr_loss.append(epoch_train_loss)\n", + " epoch_vl_loss.append(epoch_val_loss)\n", + " epoch_tr_acc.append(epoch_train_acc)\n", + " epoch_vl_acc.append(epoch_val_acc)\n", + " print(f'Epoch {epoch+1}')\n", + " print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')\n", + " print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')\n", + " if epoch_val_loss <= valid_loss_min:\n", + " print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min,epoch_val_loss))\n", + " # torch.save(model.state_dict(), '../working/state_dict.pt')\n", + " valid_loss_min = epoch_val_loss\n", + " print(25*'==')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {}, + "colab": { + "base_uri": "https://localhost:8080/", + "height": 364 + }, + "id": "ttJazP-nfjEH", + "outputId": "992bed02-611e-4614-c60f-77223d5b801a" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ], + "source": [ + "fig = plt.figure(figsize = (20, 6))\n", + "plt.subplot(1, 2, 1)\n", + "plt.plot(epoch_tr_acc, label='Train Acc')\n", + "plt.plot(epoch_vl_acc, label='Validation Acc')\n", + "plt.title(\"Accuracy\")\n", + "plt.legend()\n", + "plt.grid()\n", + "\n", + "plt.subplot(1, 2, 2)\n", + "plt.plot(epoch_tr_loss, label='Train loss')\n", + "plt.plot(epoch_vl_loss, label='Validation loss')\n", + "plt.title(\"Loss\")\n", + "plt.legend()\n", + "plt.grid()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {}, + "id": "iUyaF-EbfjEH" + }, + "source": [ + "---\n", + "# What's Next?\n", + "\n", + "You can use this project template as a starting point to think about your own project. There are a lot of ways to continue; here we share with you some ideas you might find useful:\n", + "\n", + "* **Work on the Preprocessing.** We used a very rudimentary way to tokenize tweets. But there are better ways to preprocess the data. Can you think of a suitable way to preprocess the data for this particular task? How does the performance of the model change when the data is processed correctly?\n", + "* **Work on the Model.** The RNN model proposed in this notebook is not optimized at all. You can work on finding a better architecture or better hyperparameters. Maybe using bidirectional LSTMs or increasing the number of stacked layers can improve the performance; feel free to try different approaches.\n", + "* **Work on the Embedding.** Our model learnt an embedding during the training on this Twitter corpus for a particular task. You can explore the representation of different words in this learned embedding. Also, you can try using different word embeddings. You can train them on this corpus or you can use an embedding trained on another corpus of data.
How does the change of the embedding affect the model performance?\n", + "* **Try sentiment analysis on another dataset.** There are lots of available datasets to work with; we can help you find one that is interesting to you. Do you believe that a sentiment analysis model trained on some corpus (Twitter dataset) will perform well on another type of data (for example, YouTube comments)?\n", + "\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [] + }, + "kernel": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "fbb4191426bd485e8e965b6d432eecae": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_df7eba182d1b4c21bc21d157eac6b996", + "IPY_MODEL_6d64402d9da74516ab4e1d46ae9f1ee3", + "IPY_MODEL_d9ca809f7b1c49e595a05458251f3ab2" + ], + "layout": "IPY_MODEL_90908b6f69524a72860214ef8bd2d946" + } + }, + "df7eba182d1b4c21bc21d157eac6b996": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module":
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_db432a2cd6244a7592fc9732f0ca4738", + "placeholder": "​", + "style": "IPY_MODEL_84485541f3a14c65a67d10a97b72bbad", + "value": "Downloading builder script: 100%" + } + }, + "6d64402d9da74516ab4e1d46ae9f1ee3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5fa7ab2ab2004e5cb692199e2bd27d6b", + "max": 4033, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ab71bd2b452146829e973d6cf99f31ed", + "value": 4033 + } + }, + "d9ca809f7b1c49e595a05458251f3ab2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_55ba92cfe0724286ac1c2bbe6577e5c8", + "placeholder": "​", + "style": "IPY_MODEL_67a4fa49ca5349d58512a16a3742d401", + "value": " 4.03k/4.03k [00:00<00:00, 114kB/s]" + } + }, + "90908b6f69524a72860214ef8bd2d946": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "db432a2cd6244a7592fc9732f0ca4738": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "84485541f3a14c65a67d10a97b72bbad": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5fa7ab2ab2004e5cb692199e2bd27d6b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ab71bd2b452146829e973d6cf99f31ed": { + 
"model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "55ba92cfe0724286ac1c2bbe6577e5c8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "67a4fa49ca5349d58512a16a3742d401": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "afd671543846468abfe37669a72845c3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_057e918ace004506aedc4e4b9942c3a8", + "IPY_MODEL_325387f6b62d47b0b21bea61676cea72", + "IPY_MODEL_ea1e3eb0e6ec4f8d82cf9b12cfe6e700" + ], + "layout": "IPY_MODEL_96c2d7ee644a438982e1792b7ec0453c" + } + }, + "057e918ace004506aedc4e4b9942c3a8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9baa1a735c0646b89953bf4a7c7fc92c", + "placeholder": "​", + "style": "IPY_MODEL_0ac9711c8ece4c5397a8cd810713adfb", + "value": "Downloading readme: 100%" + } + }, + "325387f6b62d47b0b21bea61676cea72": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a8d69769921241b8b1081e84f7770858", + "max": 6837, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d189f24b0e964d1a9fc86379bad38cca", + "value": 6837 + } + }, + "ea1e3eb0e6ec4f8d82cf9b12cfe6e700": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_db9bf44dec914db793cc4f73751c272c", + "placeholder": "​", + "style": "IPY_MODEL_1cf3ba0f756f4aa5ad1dcb675a791cfa", + "value": " 6.84k/6.84k [00:00<00:00, 157kB/s]" + } + }, + "96c2d7ee644a438982e1792b7ec0453c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9baa1a735c0646b89953bf4a7c7fc92c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0ac9711c8ece4c5397a8cd810713adfb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a8d69769921241b8b1081e84f7770858": { + "model_module": "@jupyter-widgets/base", + "model_name": 
"LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d189f24b0e964d1a9fc86379bad38cca": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "db9bf44dec914db793cc4f73751c272c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1cf3ba0f756f4aa5ad1dcb675a791cfa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c432c4efcb794ce781fcb6f176f1b60d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_510eeffb32694e7798f23e3931d7a943", + "IPY_MODEL_a8b3dfaa2831416582d8eeef01451386", + "IPY_MODEL_db1cdafaf36f4c339476f3221abc17b3" + ], + "layout": 
"IPY_MODEL_ffd3778a96e046718828bbc5aa73f173" + } + }, + "510eeffb32694e7798f23e3931d7a943": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_49c5a3fbe87b491cb3f0f450a0af0659", + "placeholder": "​", + "style": "IPY_MODEL_252949e8784c4878a62eb2e30b1e3466", + "value": "Downloading data: 100%" + } + }, + "a8b3dfaa2831416582d8eeef01451386": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7bcef602e7f441308472bc145b12dcd3", + "max": 81363704, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_97fb30a5a31742efa1d188b9361f9938", + "value": 81363704 + } + }, + "db1cdafaf36f4c339476f3221abc17b3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_9b34daddb9cc48bba109e547177ec654", + "placeholder": "​", + "style": "IPY_MODEL_fd2b5a6533794a2794579956c25247fb", + "value": " 81.4M/81.4M [00:06<00:00, 15.3MB/s]" + } + }, + "ffd3778a96e046718828bbc5aa73f173": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "49c5a3fbe87b491cb3f0f450a0af0659": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + 
"grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "252949e8784c4878a62eb2e30b1e3466": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7bcef602e7f441308472bc145b12dcd3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "97fb30a5a31742efa1d188b9361f9938": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9b34daddb9cc48bba109e547177ec654": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": 
null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fd2b5a6533794a2794579956c25247fb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f3a9667c8c994324a2409f227bd0a1e9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6e6c5372ffe045c0b72587989567429e", + "IPY_MODEL_2ead0216695e4227aef44552f4ec3cc9", + "IPY_MODEL_53843f49adda4bce8450fd91fa9fd587" + ], + "layout": "IPY_MODEL_40262cb3eefa45fcbe37aaafccb69f5f" + } + }, + "6e6c5372ffe045c0b72587989567429e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b54b826314ea4b3a92eebd218c093fc1", + "placeholder": "​", + "style": "IPY_MODEL_8cd7be688b8c4818be48915db14a0792", + "value": "Generating train split: 100%" + } + }, + "2ead0216695e4227aef44552f4ec3cc9": { + 
"model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a9a0f6ce71ed415c8c8513f68e34e162", + "max": 1600000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_7f638a6deacd42e88c031fa47797516b", + "value": 1600000 + } + }, + "53843f49adda4bce8450fd91fa9fd587": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_849e39cc86f64e558ff94bf542a5121a", + "placeholder": "​", + "style": "IPY_MODEL_67b0b03c391c414bad5ea9fb3c947a2f", + "value": " 1600000/1600000 [01:18<00:00, 14710.70 examples/s]" + } + }, + "40262cb3eefa45fcbe37aaafccb69f5f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b54b826314ea4b3a92eebd218c093fc1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8cd7be688b8c4818be48915db14a0792": { + "model_module": "@jupyter-widgets/controls", + 
"model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a9a0f6ce71ed415c8c8513f68e34e162": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7f638a6deacd42e88c031fa47797516b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "849e39cc86f64e558ff94bf542a5121a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "67b0b03c391c414bad5ea9fb3c947a2f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1cef38981af6457dbaeb393f9936a389": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b0b5cfae51214c60bbca9a09b196c217", + "IPY_MODEL_5ee2a4b33be04c6db8ee4d7995c2376d", + "IPY_MODEL_403fffb635c2409ebeabc90063750ed3" + ], + "layout": "IPY_MODEL_6279343019064572adedf34cfbd437fa" + } + }, + "b0b5cfae51214c60bbca9a09b196c217": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2715d00db77545f9aa5eace8a0eb2839", + "placeholder": "​", + "style": "IPY_MODEL_942ce490d87347c789e229589b1b9c9f", + "value": "Generating test split: 100%" + } + }, + "5ee2a4b33be04c6db8ee4d7995c2376d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f04df4daeb6049ab85d3d75b472ccf6e", + "max": 498, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_fd0b3c53b66543cea0c396d8047445a8", + "value": 498 + } + }, + "403fffb635c2409ebeabc90063750ed3": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2c42e2fef6314c9e842a7e9641af3cab", + "placeholder": "​", + "style": "IPY_MODEL_913d95e58aa94e4a8009768a23fbf304", + "value": " 498/498 [00:00<00:00, 7393.07 examples/s]" + } + }, + "6279343019064572adedf34cfbd437fa": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2715d00db77545f9aa5eace8a0eb2839": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + 
"model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "942ce490d87347c789e229589b1b9c9f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f04df4daeb6049ab85d3d75b472ccf6e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", 
+ "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fd0b3c53b66543cea0c396d8047445a8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2c42e2fef6314c9e842a7e9641af3cab": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + 
"grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "913d95e58aa94e4a8009768a23fbf304": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e1348a02ceeb4af19fbd63d52b7d843b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_fbf51b14e6b34d0485ddf59c43d22c49", + "IPY_MODEL_c29e06a72ac9401b8c41f4195021071e", + "IPY_MODEL_48b812211db04284bfbbf02823fb879a" + ], + "layout": "IPY_MODEL_5455119809c74916acc50e1905903ded" + } + }, + "fbf51b14e6b34d0485ddf59c43d22c49": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2475bd62a3224bacb38a6334d07d6a8c", + "placeholder": "​", + "style": "IPY_MODEL_3d29947b5d2d4e2abc1355d900096642", + "value": "100%" + } + }, + "c29e06a72ac9401b8c41f4195021071e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3f7a8f56f15c434da70029366a37167a", + "max": 1280000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3610a2db297f4686bf9043f2b7ee55b5", + "value": 1280000 + } + }, + "48b812211db04284bfbbf02823fb879a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a1bd0616199e44538977ee2ea6049690", + "placeholder": "​", + "style": "IPY_MODEL_835fb9a91b34471fa6d61adf37616f52", + "value": " 1280000/1280000 [00:22<00:00, 77416.28it/s]" + } + }, + "5455119809c74916acc50e1905903ded": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2475bd62a3224bacb38a6334d07d6a8c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3d29947b5d2d4e2abc1355d900096642": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3f7a8f56f15c434da70029366a37167a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + 
"width": null + } + }, + "3610a2db297f4686bf9043f2b7ee55b5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "a1bd0616199e44538977ee2ea6049690": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "835fb9a91b34471fa6d61adf37616f52": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d8de1a85076b453a92295e79110ba8fd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_78d48ee2fb9f42089f475fcf5fc368c8", + "IPY_MODEL_b0ca3012d0b84c5a9d7c1fc176251af7", + "IPY_MODEL_39fa73efcbf54d8dad225d8380061dbf" + ], + "layout": "IPY_MODEL_6b6cc35257fe433e93736d02e898b6b8" + } + }, + "78d48ee2fb9f42089f475fcf5fc368c8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e0fc900d8b5940a6bd6a97e58adb4651", + "placeholder": "​", + "style": "IPY_MODEL_6b7286d74e0f4a0199dbfcaf3dd0d622", + "value": "100%" + } + }, + "b0ca3012d0b84c5a9d7c1fc176251af7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": 
"ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a4bbd3df99cd4acab5e1b3ba5cd7c114", + "max": 320000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9a7140a6197945d5bac5c48b820dfb04", + "value": 320000 + } + }, + "39fa73efcbf54d8dad225d8380061dbf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0bdc146792a64853ae06a9d185aa2b15", + "placeholder": "​", + "style": "IPY_MODEL_768da964ffcd44fea1af09e81f5621f3", + "value": " 320000/320000 [00:06<00:00, 58691.43it/s]" + } + }, + "6b6cc35257fe433e93736d02e898b6b8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, 
+ "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e0fc900d8b5940a6bd6a97e58adb4651": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6b7286d74e0f4a0199dbfcaf3dd0d622": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a4bbd3df99cd4acab5e1b3ba5cd7c114": { + "model_module": 
"@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9a7140a6197945d5bac5c48b820dfb04": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0bdc146792a64853ae06a9d185aa2b15": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "768da964ffcd44fea1af09e81f5621f3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From 6a02a9df36b49445964f0dbb9dfda74b8eed4cb4 Mon Sep 17 00:00:00 2001 From: dalia-nasr Date: Sun, 7 Jul 2024 11:40:43 +0300 Subject: [PATCH 07/12] reverted edit and replaced cells with required changes --- .../sentiment_analysis.ipynb | 6301 +++++------------ 1 file changed, 1666 insertions(+), 4635 deletions(-) diff --git a/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb b/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb index eb393529a..87c305bde 100644 --- 
a/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb +++ b/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb @@ -1,4640 +1,1671 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "view-in-github" - }, - "source": [ - "\"Open   \"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "D_fgc45VfjDz" - }, - "source": [ - "# Twitter Sentiment Analysis\n", - "\n", - "**By Neuromatch Academy**\n", - "\n", - "__Content creators:__ Juan Manuel Rodriguez, Salomey Osei, Gonzalo Uribarri\n", - "\n", - "__Production editors:__ Amita Kapoor, Spiros Chavlis" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "axvz0SUsfjD0" - }, - "source": [ - "---\n", - "# Welcome to the NLP project template\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "2Vfm0ThbfjD1" - }, - "source": [ - "---\n", - "# Step 1: Questions and goals\n", - "\n", - "* Can we infer emotion from a tweet text?\n", - "* How words are distributed accross the dataset?\n", - "* Are words related to one kind of emotion?" 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "Vd1qdNW9fjD1" - }, - "source": [ - "---\n", - "# Step 2: Literature review\n", - "\n", - "[Original Dataset Paper](https://cs.stanford.edu/people/alecmgo/papers/TwitterDistantSupervision09.pdf)\n", - "\n", - "[Papers with code](https://paperswithcode.com/dataset/imdb-movie-reviews)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "oOYDQElpfjD2" - }, - "source": [ - "---\n", - "# Step 3: Load and explore the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "EZpxSExUfjD2", - "outputId": "19b01445-9b83-4a93-9cc2-7830ab0dcf5b" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.3/21.3 MB\u001b[0m \u001b[31m60.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m547.8/547.8 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.8/40.8 MB\u001b[0m \u001b[31m11.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m64.9/64.9 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m20.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m15.7 
MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 16.1.0 which is incompatible.\n", - "google-colab 1.0.0 requires requests==2.31.0, but you have requests 2.32.3 which is incompatible.\n", - "ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 16.1.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "# @title Install dependencies\n", - "!pip install pandas --quiet\n", - "!pip install torchtext --quiet\n", - "!pip install datasets --quiet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "DxqD3Tk5fjD3", - "outputId": "451d68c5-7894-4f93-9f54-bf0b7f482e20" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.10/dist-packages/torchtext/data/__init__.py:4: UserWarning: \n", - "/!\\ IMPORTANT WARNING ABOUT TORCHTEXT STATUS /!\\ \n", - "Torchtext is deprecated and the last released version will be 0.18 (this one). 
You can silence this warning by calling the following at the beginnign of your scripts: `import torchtext; torchtext.disable_torchtext_deprecation_warning()`\n", - " warnings.warn(torchtext._TORCHTEXT_DEPRECATION_MSG)\n" - ] - } - ], - "source": [ - "# We import some libraries to load the dataset\n", - "import os\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from datasets import load_dataset\n", - "\n", - "from collections import Counter\n", - "from tqdm.notebook import tqdm\n", - "\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "import torch.nn.functional as F\n", - "from torch.utils.data import TensorDataset, DataLoader\n", - "\n", - "import torchtext\n", - "from torchtext.data import get_tokenizer\n", - "\n", - "from sklearn.utils import shuffle\n", - "from sklearn.metrics import classification_report\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.feature_extraction.text import CountVectorizer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "63Eg1SLbfjD4" - }, - "source": [ - "You can find the dataset we are going to use in [this website](http://help.sentiment140.com/for-students/)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/", - "height": 567, - "referenced_widgets": [ - "fbb4191426bd485e8e965b6d432eecae", - "df7eba182d1b4c21bc21d157eac6b996", - "6d64402d9da74516ab4e1d46ae9f1ee3", - "d9ca809f7b1c49e595a05458251f3ab2", - "90908b6f69524a72860214ef8bd2d946", - "db432a2cd6244a7592fc9732f0ca4738", - "84485541f3a14c65a67d10a97b72bbad", - "5fa7ab2ab2004e5cb692199e2bd27d6b", - "ab71bd2b452146829e973d6cf99f31ed", - "55ba92cfe0724286ac1c2bbe6577e5c8", - "67a4fa49ca5349d58512a16a3742d401", - "afd671543846468abfe37669a72845c3", - "057e918ace004506aedc4e4b9942c3a8", - "325387f6b62d47b0b21bea61676cea72", - "ea1e3eb0e6ec4f8d82cf9b12cfe6e700", - "96c2d7ee644a438982e1792b7ec0453c", - "9baa1a735c0646b89953bf4a7c7fc92c", - "0ac9711c8ece4c5397a8cd810713adfb", - "a8d69769921241b8b1081e84f7770858", - "d189f24b0e964d1a9fc86379bad38cca", - "db9bf44dec914db793cc4f73751c272c", - "1cf3ba0f756f4aa5ad1dcb675a791cfa", - "c432c4efcb794ce781fcb6f176f1b60d", - "510eeffb32694e7798f23e3931d7a943", - "a8b3dfaa2831416582d8eeef01451386", - "db1cdafaf36f4c339476f3221abc17b3", - "ffd3778a96e046718828bbc5aa73f173", - "49c5a3fbe87b491cb3f0f450a0af0659", - "252949e8784c4878a62eb2e30b1e3466", - "7bcef602e7f441308472bc145b12dcd3", - "97fb30a5a31742efa1d188b9361f9938", - "9b34daddb9cc48bba109e547177ec654", - "fd2b5a6533794a2794579956c25247fb", - "f3a9667c8c994324a2409f227bd0a1e9", - "6e6c5372ffe045c0b72587989567429e", - "2ead0216695e4227aef44552f4ec3cc9", - "53843f49adda4bce8450fd91fa9fd587", - "40262cb3eefa45fcbe37aaafccb69f5f", - "b54b826314ea4b3a92eebd218c093fc1", - "8cd7be688b8c4818be48915db14a0792", - "a9a0f6ce71ed415c8c8513f68e34e162", - "7f638a6deacd42e88c031fa47797516b", - "849e39cc86f64e558ff94bf542a5121a", - "67b0b03c391c414bad5ea9fb3c947a2f", - "1cef38981af6457dbaeb393f9936a389", - "b0b5cfae51214c60bbca9a09b196c217", - "5ee2a4b33be04c6db8ee4d7995c2376d", - 
"403fffb635c2409ebeabc90063750ed3", - "6279343019064572adedf34cfbd437fa", - "2715d00db77545f9aa5eace8a0eb2839", - "942ce490d87347c789e229589b1b9c9f", - "f04df4daeb6049ab85d3d75b472ccf6e", - "fd0b3c53b66543cea0c396d8047445a8", - "2c42e2fef6314c9e842a7e9641af3cab", - "913d95e58aa94e4a8009768a23fbf304" - ] - }, - "id": "3HLOsd3rfjD4", - "outputId": "7653fee1-a871-472b-a978-d8ec0250dc84" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", - "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", - "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", - "You will be able to reuse this secret in all of your notebooks.\n", - "Please note that authentication is recommended but still optional to access public models or datasets.\n", - " warnings.warn(\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "Downloading builder script: 0%| | 0.00/4.03k [00:00\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
polarityuserdatequeryusertext
00_TheSpecialOne_Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, t...
10scotthamiltonMon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
20mattycusMon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
30ElleCTFMon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
40KaroliMon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - " \n" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "df" - } - }, - "metadata": {}, - "execution_count": 3 - } - ], - "source": [ - "# We load the dataset\n", - "\n", - "dataset = load_dataset(\"stanfordnlp/sentiment140\", trust_remote_code= True)\n", - "\n", - "train_data = dataset[\"train\"]\n", - "df = pd.DataFrame(train_data)\n", - "df = df.rename(columns={'sentiment': 'polarity'})\n", - "df = df[['polarity', 'user', 'date', 'query', 'user', 'text']]\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "fuKShcfjfjD4" - }, - "source": [ - "For this project we will use only the text and the polarity of the tweet. Notice that polarity is 0 for negative tweets and 4 for positive tweet." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "id": "GXHQOn6gfjD5" - }, - "outputs": [], - "source": [ - "X = df.text.values\n", - "\n", - "# Changes values from [0,4] to [0,1]\n", - "y = (df.polarity.values > 1).astype(int)\n", - "\n", - "\n", - "# Split the data into train and test\n", - "x_train_text, x_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "7kr3TO_LfjD5" - }, - "source": [ - "The first thing we have to do before working on the models is to familiarize ourselves with the dataset. This is called Exploratory Data Analisys (EDA)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "FsL-xY03fjD5", - "outputId": "655f0ef8-c177-4f42-c024-1d628241401a" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "1: @paisleypaisley LOL why do i get ideas so far in advance? it's not even june yet! 
we need a third knitter to have our own summer group \n", - "0: worst headache ever \n", - "0: @ewaniesciuszko i am so sad i wont see you! I miss you already. and yeah! that's perfect; i come back the 18th!\n", - "1: doesn't know how to spell conked \n", - "0: "So we stand here now and no one knows us at all I won't get used to this I won't get used to being gone"...I miss home and everyone -a\n" - ] - } - ], - "source": [ - "for s, l in zip(x_train_text[:5], y_train[:5]):\n", - " print('{}: {}'.format(l, s))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "4cPGXSc-fjD5" - }, - "source": [ - "An interesting thing to analyze is the Word Distribution. In order to count the occurrences of each word, we should tokenize the sentences first." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "U1OugpZ0fjD5", - "outputId": "9e6cb4e3-8d8c-4db0-c113-bdd4fe87db5f" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Before Tokenize: worst headache ever \n", - "After Tokenize: ['worst', 'headache', 'ever']\n" - ] - } - ], - "source": [ - "tokenizer = get_tokenizer(\"basic_english\")\n", - "\n", - "print('Before Tokenize: ', x_train_text[1])\n", - "print('After Tokenize: ', tokenizer(x_train_text[1]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81, - "referenced_widgets": [ - "e1348a02ceeb4af19fbd63d52b7d843b", - "fbf51b14e6b34d0485ddf59c43d22c49", - "c29e06a72ac9401b8c41f4195021071e", - "48b812211db04284bfbbf02823fb879a", - "5455119809c74916acc50e1905903ded", - "2475bd62a3224bacb38a6334d07d6a8c", - "3d29947b5d2d4e2abc1355d900096642", - "3f7a8f56f15c434da70029366a37167a", - "3610a2db297f4686bf9043f2b7ee55b5", - "a1bd0616199e44538977ee2ea6049690", - 
"835fb9a91b34471fa6d61adf37616f52", - "d8de1a85076b453a92295e79110ba8fd", - "78d48ee2fb9f42089f475fcf5fc368c8", - "b0ca3012d0b84c5a9d7c1fc176251af7", - "39fa73efcbf54d8dad225d8380061dbf", - "6b6cc35257fe433e93736d02e898b6b8", - "e0fc900d8b5940a6bd6a97e58adb4651", - "6b7286d74e0f4a0199dbfcaf3dd0d622", - "a4bbd3df99cd4acab5e1b3ba5cd7c114", - "9a7140a6197945d5bac5c48b820dfb04", - "0bdc146792a64853ae06a9d185aa2b15", - "768da964ffcd44fea1af09e81f5621f3" - ] - }, - "id": "7ZggzGCXfjD6", - "outputId": "ae19f8d6-224d-4224-d3a0-d00c659ec9b2" - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - " 0%| | 0/1280000 [00:00" - ], - "image/png": "\n" - }, - "metadata": {} - } - ], - "source": [ - "plt.bar(range(100), [words[w] for w in sorted_words[:100]])\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "o9IYA0cZfjD7" - }, - "source": [ - "It is very common to find this kind of distribution when analyzing corpus of text. This is referred to as the [zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "5FQIOqoRfjD7" - }, - "source": [ - "Usually the number of words in the dictionary will be very large.\n", - "\n", - "Here are some thing we can do to reduce that number:\n", - "\n", - "* Remove puntuation.\n", - "* Remove stop-words.\n", - "* Steaming.\n", - "* Remove very uncommon words (the words that appears in fewer than N occations).\n", - "* Nothing: we can use a pretrain model that handles this kind of situations.\n", - "\n", - "\n", - "We used one of the simplest tokenizers availables. This tokenizer does not take into account many quirks of the language. Moreover, diferent languages have different quirks, so there is no \"universal\" tokenizers. There are many libraries that have \"better\" tokenizers:\n", - "\n", - "* [Spacy](https://spacy.io/): it can be accessed using: `get_tokenizer(\"spacy\")`. 
Spacy supports a wide range of languages.\n", - "* [Huggingface](https://huggingface.co/): it has many tokenizers for different laguages. [Doc](https://huggingface.co/transformers/main_classes/tokenizer.html)\n", - "* [NLTK](https://www.nltk.org/): it provides several tokenizers. One of them can be accessed using: `get_tokenizer(\"toktok\")`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "_ul5MgYcfjD7" - }, - "source": [ - "---\n", - "# Step 4: choose toolkit\n", - "\n", - "Our goal is to train a model capable of estimating the sentiment of a tweet (positive or negative) by reading its content. To that end we will try 2 different approaches:\n", - "\n", - "* A logistic regression using sklearn. **NOTE**: it can probaly work better than an SVM model.\n", - "* A simple Embedding + RNN." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "GteI1PxTfjD7" - }, - "source": [ - "## Logistic regression\n", - "\n", - "We will represent our senteces using binary vectorization. This means that our data would be represented as a matrix of instances by word with a one if the word is in the instance, and zero otherwise. Sklean vectorizers can also do things such as stop-word removal and puntuation removal, you can read more about in [the documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "id": "S_ei2qu8fjD7" - }, - "outputs": [], - "source": [ - "vectorizer = CountVectorizer(binary=True)\n", - "x_train_cv = vectorizer.fit_transform(x_train_text)\n", - "x_test_cv = vectorizer.transform(x_test_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "iK_zfqnLfjD7", - "outputId": "9b3f6db3-01bf-4246-b943-359620c717a2" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Before Vectorize: doesn't know how to spell conked \n" - ] - } - ], - "source": [ - "print('Before Vectorize: ', x_train_text[3])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "wKxY8e38fjD8", - "outputId": "19530135-070d-4259-d6a9-7ba06b519763" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "After Vectorize: \n", - " (0, 528584)\t1\n", - " (0, 165468)\t1\n", - " (0, 300381)\t1\n", - " (0, 242211)\t1\n", - " (0, 489893)\t1\n", - " (0, 134160)\t1\n" - ] - } - ], - "source": [ - "# Notice that the matriz is sparse\n", - "print('After Vectorize: ')\n", - "print(x_train_cv[3])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "QTPPEMd9fjD8" - }, - "source": [ - "Now we can train our model. You can check the documentation of this logistic regressor [here](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic#sklearn.linear_model.LogisticRegression)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/", - "height": 74 - }, - "id": "2vEPOQS6fjD8", - "outputId": "3be77fc0-76e6-40b8-8847-5f6e7c6c0ce0" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "LogisticRegression(solver='saga')" - ], - "text/html": [ - "
LogisticRegression(solver='saga')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" - ] - }, - "metadata": {}, - "execution_count": 14 - } - ], - "source": [ - "model = LogisticRegression(solver='saga')\n", - "model.fit(x_train_cv, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "37bUbqB6fjD8", - "outputId": "7eb9178d-6130-47d0-bdf4-ce4be164bc97" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " precision recall f1-score support\n", - "\n", - " 0 0.81 0.79 0.80 160000\n", - " 1 0.79 0.81 0.80 160000\n", - "\n", - " accuracy 0.80 320000\n", - " macro avg 0.80 0.80 0.80 320000\n", - "weighted avg 0.80 0.80 0.80 320000\n", - "\n" - ] - } - ], - "source": [ - "y_pred = model.predict(x_test_cv)\n", - "\n", - "print(classification_report(y_test, y_pred))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "161kDLhofjD8" - }, - "source": [ - "## Explainable AI\n", - "The best thing about logistic regresion is that it is simple, and we can get some explanations." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "EILTmxzifjD9", - "outputId": "b7ce6853-7385-4a24-d4eb-e6d0843ca5d5" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "(1, 589260)\n", - "589260\n" - ] - } - ], - "source": [ - "print(model.coef_.shape)\n", - "print(len(vectorizer.vocabulary_))\n", - "\n", - "words_sk = list(vectorizer.vocabulary_.keys())\n", - "words_sk.sort(key=lambda w: model.coef_[0, vectorizer.vocabulary_[w]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "NGjVPON6fjD9", - "outputId": "d40443bc-476d-4f5a-ce90-4b5b17e47933" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "roni: -3.8625952420933984\n", - "inaperfectworld: -3.5734321547933936\n", - "dontyouhate: -3.5002133484207576\n", - "xbllygbsn: -3.4126303898325787\n", - "anqju: -3.3363997631497493\n", - "sad: -3.200516823534637\n", - "pakcricket: -3.1949062976331675\n", - "condolences: -3.132503698316079\n", - "heartbreaking: -3.0665219866881297\n", - "saddest: -3.042020604188048\n", - "sadd: -3.029036146667248\n", - "heartbroken: -3.0287524416643463\n", - "boohoo: -3.0226033087262802\n", - "sadface: -2.991829110065316\n", - "rachelle_lefevr: -2.925076661509848\n", - "disappointing: -2.902522686643491\n", - "lvbu: -2.8947109582208865\n", - "saddens: -2.8855187276040715\n", - "bummed: -2.836500453805889\n", - "neda: -2.792917726280752\n" - ] - } - ], - "source": [ - "for w in words_sk[:20]:\n", - " print('{}: {}'.format(w, model.coef_[0, vectorizer.vocabulary_[w]]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "oxQ_jlNRfjD9", - "outputId": 
"363de58b-817a-4205-f019-2379d0d64e0d" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "iamsoannoyed: 2.8493838469077013\n", - "myfax: 2.7974330510971424\n", - "jennamadison: 2.5667217237933104\n", - "yeyy: 2.4780234846131646\n", - "tryout: 2.438315611477797\n", - "goldymom: 2.4374072779309204\n", - "wooohooo: 2.402957513257194\n", - "thesupergirl: 2.356525094856456\n", - "iammaxathotspot: 2.3116551216589682\n", - "londicreations: 2.3074264075299316\n", - "smilin: 2.2991796213822497\n", - "worries: 2.2899555142510084\n", - "sinfulsignorita: 2.27989578448778\n", - "finchensnail: 2.2642827277181063\n", - "smackthis: 2.237672991997692\n", - "kv: 2.2157591386122775\n", - "tojosan: 2.2117938132889696\n", - "russmarshalek: 2.20953890861265\n", - "traciknoppe: 2.1768232307222153\n", - "congratulations: 2.1715901103136876\n" - ] - } - ], - "source": [ - "for w in reversed(words_sk[-20:]):\n", - " print('{}: {}'.format(w, model.coef_[0, vectorizer.vocabulary_[w]]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "9KSSAC3qfjD9" - }, - "source": [ - "What does this mean?\n", - "\n", - "Remember the `model.coef_` is the $W$ in:\n", - "\n", - "$$h(x)=\\sigma(WX + b)$$\n", - "\n", - "where the label 1 is a positive tweet and the label 0 is a negative tweet." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "oDHjTP2_fjD9" - }, - "source": [ - "## Recurrent Neural Network with Pytorch" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "TbgpKy95fjD9" - }, - "source": [ - "In the previous section we use a Bag-Of-Words approach to represent each of the tweets. That meas that we only consider how many times each of the words appear in each of the tweets, we didnt take into account the order of the words. 
But we know that the word order is very important and carries relevant information.\n", - "\n", - "In this section we will solve the same task, but this time we will implement a Recurrent Neural Network (RNN) instead of using a simple Logistic Regression.Unlike feedforward neural networks, RNNs have cyclic connections making them powerful for modeling sequences.\n", - "\n", - "Let's start by importing the relevant libraries.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "id": "7nmUJV99fjEB" - }, - "outputs": [], - "source": [ - "def set_device():\n", - " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - " if device != \"cuda\":\n", - " print(\"WARNING: For this notebook to perform best, \"\n", - " \"if possible, in the menu under `Runtime` -> \"\n", - " \"`Change runtime type.` select `GPU` \")\n", - " else:\n", - " print(\"GPU is enabled in this notebook.\")\n", - "\n", - " return device" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "chI-18LcfjEB", - "outputId": "7f633079-6548-48f3-802e-94bc9cfada93" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "GPU is enabled in this notebook.\n" - ] - } - ], - "source": [ - "# Set the device (check if gpu is available)\n", - "device = set_device()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "01UtIN7ofjEC" - }, - "source": [ - "First we will create a Dictionary (`word_to_idx`). This dictionary will map each Token (usually words) to an index (an integer number). We want to limit our dictionary to a certain number of tokens (`num_words_dict`), so we will include in our ditionary those with more occurrences." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "afus9SyUfjEC", - "outputId": "bb4eb869-e2f0-4ccd-f64c-e55908272345" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['.', 'i', '!', \"'\", 'to', 'the', ',', 'a', 'my', 'it']" - ] - }, - "metadata": {}, - "execution_count": 21 - } - ], - "source": [ - "# From previous section, we have a list with the most used tokens\n", - "sorted_words[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "6vfQFjaufjEC" - }, - "source": [ - "Let's select only the most used." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "id": "tGLkxaGcfjEC" - }, - "outputs": [], - "source": [ - "num_words_dict = 30000\n", - "# We reserve two numbers for special tokens.\n", - "most_used_words = sorted_words[:num_words_dict-2]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "AzhQvekCfjEC" - }, - "source": [ - "We will add two extra Tokens to the dictionary, one for words outside the dictionary (`'UNK'`) and one for padding the sequences (`'PAD'`)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "id": "73Wrb-lEfjEC" - }, - "outputs": [], - "source": [ - "# dictionary to go from words to idx\n", - "word_to_idx = {}\n", - "# dictionary to go from idx to words (just in case)\n", - "idx_to_word = {}\n", - "\n", - "\n", - "# We include the special tokens first\n", - "PAD_token = 0\n", - "UNK_token = 1\n", - "\n", - "word_to_idx['PAD'] = PAD_token\n", - "word_to_idx['UNK'] = UNK_token\n", - "\n", - "idx_to_word[PAD_token] = 'PAD'\n", - "idx_to_word[UNK_token] = 'UNK'\n", - "\n", - "# We popullate our dictionaries with the most used words\n", - "for num,word in enumerate(most_used_words):\n", - " word_to_idx[word] = num + 2\n", - " idx_to_word[num+2] = word" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "kMHVkEisfjEC" - }, - "source": [ - "Our goal now is to transform each tweet from a sequence of tokens to a sequence of indexes. These sequences of indexes will be the input to our pytorch model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "id": "tkCIu3PKfjED" - }, - "outputs": [], - "source": [ - "# A function to convert list of tokens to list of indexes\n", - "def tokens_to_idx(sentences_tokens,word_to_idx):\n", - " sentences_idx = []\n", - " for sent in sentences_tokens:\n", - " sent_idx = []\n", - " for word in sent:\n", - " if word in word_to_idx:\n", - " sent_idx.append(word_to_idx[word])\n", - " else:\n", - " sent_idx.append(word_to_idx['UNK'])\n", - " sentences_idx.append(sent_idx)\n", - " return sentences_idx" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "id": "aHru4vpzfjED" - }, - "outputs": [], - "source": [ - "x_train_idx = tokens_to_idx(x_train_token,word_to_idx)\n", - "x_test_idx = tokens_to_idx(x_test_token,word_to_idx)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Ofj3OD7zfjED", - "outputId": "b2788d03-dbfa-41d7-8231-5011206baa59" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Before converting: ['worst', 'headache', 'ever']\n", - "After converting: [721, 458, 237]\n" - ] - } - ], - "source": [ - "some_number = 1\n", - "print('Before converting: ', x_train_token[some_number])\n", - "print('After converting: ', x_train_idx[some_number])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "NcCicvb-fjED" - }, - "source": [ - "We need all the sequences to have the same length. 
To select an adequate sequence length, let's explore some statistics about the length of the tweets:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "BSjhdyYUfjED", - "outputId": "82e49be9-7868-44ac-b496-c7a48da1efee" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Max tweet word length: 229\n", - "Mean tweet word length: 15.0\n", - "99% percent under: 37.0\n" - ] - } - ], - "source": [ - "tweet_lens = np.asarray([len(sentence) for sentence in x_train_idx])\n", - "print('Max tweet word length: ',tweet_lens.max())\n", - "print('Mean tweet word length: ',np.median(tweet_lens))\n", - "print('99% percent under: ',np.quantile(tweet_lens,0.99))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "t311WY6ZfjEE" - }, - "source": [ - "We cut the sequences which are larger than our chosen maximum length (`max_lenght`) and fill with zeros the ones that are shorter." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "id": "r4S8KTWLfjEE" - }, - "outputs": [], - "source": [ - " # We choose the max length\n", - " max_length = 40\n", - "\n", - "# A function to make all the sequence have the same lenght\n", - "# Note that the output is a Numpy matrix\n", - " def padding(sentences, seq_len):\n", - " features = np.zeros((len(sentences), seq_len),dtype=int)\n", - " for ii, tweet in enumerate(sentences):\n", - " len_tweet = len(tweet)\n", - " if len_tweet != 0:\n", - " if len_tweet <= seq_len:\n", - " # If its shorter, we fill with zeros (the padding Token index)\n", - " features[ii, -len(tweet):] = np.array(tweet)[:seq_len]\n", - " if len_tweet > seq_len:\n", - " # If its larger, we take the last 'seq_len' indexes\n", - " features[ii, :] = np.array(tweet)[-seq_len:]\n", - " return features" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "id": "Z-Cw-bBxfjEE" - }, - "outputs": [], - "source": [ - "# We convert our list of tokens into a numpy matrix\n", - "# where all instances have the same lenght\n", - "x_train_pad = padding(x_train_idx,max_length)\n", - "x_test_pad = padding(x_test_idx,max_length)\n", - "\n", - "# We convert our target list a numpy matrix\n", - "y_train_np = np.asarray(y_train)\n", - "y_test_np = np.asarray(y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8eC3YswlfjEE", - "outputId": "3bb0ea7f-518f-4545-9241-feb783f48122" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Before padding: [1, 3, 71, 24, 122, 3, 533, 74, 13, 4, 3, 102, 13, 209, 2, 12, 150, 4, 22, 5, 18, 667, 3, 138, 61, 7, 3296, 4]\n", - "After padding: [ 0 0 0 0 0 0 0 0 0 0 0 0 1 3\n", - " 71 24 122 3 533 74 13 4 3 102 13 209 2 12\n", - " 150 4 22 5 18 667 3 138 61 7 3296 4]\n" - ] - } - ], - 
"source": [ - "some_number = 2\n", - "print('Before padding: ', x_train_idx[some_number])\n", - "print('After padding: ', x_train_pad[some_number])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "SzDhnauUfjEE" - }, - "source": [ - "Now, let's convert the data to pytorch format.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "id": "--Yd22YWfjEF" - }, - "outputs": [], - "source": [ - "# create Tensor datasets\n", - "train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train_np))\n", - "valid_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test_np))\n", - "\n", - "# Batch size (this is an important hyperparameter)\n", - "batch_size = 100\n", - "\n", - "# dataloaders\n", - "# make sure to SHUFFLE your data\n", - "train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,drop_last = True)\n", - "valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size,drop_last = True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "jQ5qPOWTfjEF" - }, - "source": [ - "Each batch of data in our traning proccess will have the folllowing format:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "S1mqhk1hfjEF", - "outputId": "c97e7edd-695f-4336-a2e6-f6bed4852a63" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Sample input size: torch.Size([100, 40])\n", - "Sample input: \n", - " tensor([[ 0, 0, 0, ..., 32, 203, 86],\n", - " [ 0, 0, 0, ..., 1, 1, 4661],\n", - " [ 0, 0, 0, ..., 169, 43, 34],\n", - " ...,\n", - " [ 0, 0, 0, ..., 2, 2961, 4076],\n", - " [ 0, 0, 0, ..., 2319, 1325, 2],\n", - " [ 0, 0, 0, ..., 7, 253, 1]])\n", - "Sample input: \n", - " tensor([0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,\n", - " 
1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0,\n", - " 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,\n", - " 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,\n", - " 0, 1, 0, 1])\n" - ] - } - ], - "source": [ - "# Obtain one batch of training data\n", - "dataiter = iter(train_loader)\n", - "sample_x, sample_y = dataiter.__next__()\n", - "\n", - "print('Sample input size: ', sample_x.size()) # batch_size, seq_length\n", - "print('Sample input: \\n', sample_x)\n", - "print('Sample input: \\n', sample_y)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "jn0PzZdGfjEF" - }, - "source": [ - "Now, we will define the `SentimentRNN` class. Most of the model's class will be familiar to you, but there are two important layers we would like you to pay attention to:\n", - "\n", - "* Embedding Layer\n", - "> This layer is like a linear layer, but it makes it posible to use a sequence of inedexes as inputs (instead of a sequence of one-hot-encoded vectors). During training, the Embedding layer learns a linear transformation from the space of words (a vector space of dimension `num_words_dict`) into the a new, smaller, vector space of dimension `embedding_dim`. We suggest you to read this [thread](https://discuss.pytorch.org/t/how-does-nn-embedding-work/88518/3) and the [pytorch documentation](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) if you want to learn more about this particular kind of layers.\n", - "\n", - "\n", - "* LSTM layer\n", - "> This is one of the most used class of Recurrent Neural Networks. In Pytorch we can add several stacked layers in just one line of code. In our case, the number of layers added are decided with the parameter `no_layers`. 
If you want to learn more about LSTMs we strongly recommend you this [Colahs thread](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) about them.\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "id": "vfzcowAxfjEF" - }, - "outputs": [], - "source": [ - "class SentimentRNN(nn.Module):\n", - " def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.1):\n", - " super(SentimentRNN,self).__init__()\n", - "\n", - " self.output_dim = output_dim\n", - " self.hidden_dim = hidden_dim\n", - " self.no_layers = no_layers\n", - " self.vocab_size = vocab_size\n", - " self.drop_prob = drop_prob\n", - "\n", - " # Embedding Layer\n", - " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n", - "\n", - " # LSTM Layers\n", - " self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,\n", - " num_layers=no_layers, batch_first=True,\n", - " dropout=self.drop_prob)\n", - "\n", - " # Dropout layer\n", - " self.dropout = nn.Dropout(drop_prob)\n", - "\n", - " # Linear and Sigmoid layer\n", - " self.fc = nn.Linear(self.hidden_dim, output_dim)\n", - " self.sig = nn.Sigmoid()\n", - "\n", - " def forward(self,x,hidden):\n", - " batch_size = x.size(0)\n", - "\n", - " # Embedding out\n", - " embeds = self.embedding(x)\n", - " #Shape: [batch_size x max_length x embedding_dim]\n", - "\n", - " # LSTM out\n", - " lstm_out, hidden = self.lstm(embeds, hidden)\n", - " # Shape: [batch_size x max_length x hidden_dim]\n", - "\n", - " # Select the activation of the last Hidden Layer\n", - " lstm_out = lstm_out[:,-1,:].contiguous()\n", - " # Shape: [batch_size x hidden_dim]\n", - "\n", - " ## You can instead average the activations across all the times\n", - " # lstm_out = torch.mean(lstm_out, 1).contiguous()\n", - "\n", - " # Dropout and Fully connected layer\n", - " out = self.dropout(lstm_out)\n", - " out = self.fc(out)\n", - "\n", - " # Sigmoid function\n", - 
" sig_out = self.sig(out)\n", - "\n", - " # return last sigmoid output and hidden state\n", - " return sig_out, hidden\n", - "\n", - " def init_hidden(self, batch_size):\n", - " ''' Initializes hidden state '''\n", - " # Create two new tensors with sizes n_layers x batch_size x hidden_dim,\n", - " # initialized to zero, for hidden state and cell state of LSTM\n", - " h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)\n", - " c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)\n", - " hidden = (h0,c0)\n", - " return hidden" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "YfrLPa9mfjEF" - }, - "source": [ - "We choose the parameters of the model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "id": "rOm-xoFkfjEG" - }, - "outputs": [], - "source": [ - "# Parameters of our network\n", - "\n", - "# Size of our vocabulary\n", - "vocab_size = num_words_dict\n", - "\n", - "# Embedding dimension\n", - "embedding_dim = 32\n", - "\n", - "# Number of stacked LSTM layers\n", - "no_layers = 2\n", - "\n", - "# Dimension of the hidden layer in LSTMs\n", - "hidden_dim = 64\n", - "\n", - "# Dropout parameter for regularization\n", - "output_dim = 1\n", - "\n", - "# Dropout parameter for regularization\n", - "drop_prob = 0.25" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "xapxpe7ufjEG", - "outputId": "51c90159-7d2b-4fc4-f34a-98e9901d40e4" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "SentimentRNN(\n", - " (embedding): Embedding(30000, 32)\n", - " (lstm): LSTM(32, 64, num_layers=2, batch_first=True, dropout=0.25)\n", - " (dropout): Dropout(p=0.25, inplace=False)\n", - " (fc): Linear(in_features=64, out_features=1, bias=True)\n", - " (sig): Sigmoid()\n", - ")\n" - ] - } - ], - "source": [ - "# Let's 
define our model\n", - "model = SentimentRNN(no_layers, vocab_size, hidden_dim,\n", - " embedding_dim, drop_prob=drop_prob)\n", - "# Moving to gpu\n", - "model.to(device)\n", - "print(model)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GEvTs3uwfjEG", - "outputId": "2e15f6df-2aa8-4665-b2da-7363d2bfa09e" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Total Number of parameters: 1018433\n" - ] - } - ], - "source": [ - "# How many trainable parameters does our model have?\n", - "model_parameters = filter(lambda p: p.requires_grad, model.parameters())\n", - "params = sum([np.prod(p.size()) for p in model_parameters])\n", - "print('Total Number of parameters: ',params)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "Pc2OC5GDfjEG" - }, - "source": [ - "We choose the losses and the optimizer for the training procces." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "id": "iBWjPADUfjEG" - }, - "outputs": [], - "source": [ - "# loss and optimization functions\n", - "lr = 0.001\n", - "\n", - "# Binary crossentropy is a good loss function for a binary classification problem\n", - "criterion = nn.BCELoss()\n", - "\n", - "# We choose an Adam optimizer\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", - "\n", - "# function to predict accuracy\n", - "def acc(pred,label):\n", - " pred = torch.round(pred.squeeze())\n", - " return torch.sum(pred == label.squeeze()).item()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "OZgMwOe2fjEG" - }, - "source": [ - "We are ready to train our model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3B6YhEocfjEH", - "outputId": "76276a1f-7775-4b98-aab0-0e199aa133e4" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Epoch 1\n", - "train_loss : 0.4366412344621494 val_loss : 0.3881208170717582\n", - "train_accuracy : 79.485546875 val_accuracy : 82.475\n", - "Validation loss decreased (inf --> 0.388121). Saving model ...\n", - "==================================================\n", - "Epoch 2\n", - "train_loss : 0.3760281792609021 val_loss : 0.3713956154882908\n", - "train_accuracy : 83.186328125 val_accuracy : 83.4575\n", - "Validation loss decreased (0.388121 --> 0.371396). Saving model ...\n", - "==================================================\n", - "Epoch 3\n", - "train_loss : 0.3574051411205437 val_loss : 0.36425656544510276\n", - "train_accuracy : 84.19953125 val_accuracy : 83.80375\n", - "Validation loss decreased (0.371396 --> 0.364257). Saving model ...\n", - "==================================================\n", - "Epoch 4\n", - "train_loss : 0.344456663565943 val_loss : 0.3613302929420024\n", - "train_accuracy : 84.89265625 val_accuracy : 84.00874999999999\n", - "Validation loss decreased (0.364257 --> 0.361330). Saving model ...\n", - "==================================================\n", - "Epoch 5\n", - "train_loss : 0.33407817618339325 val_loss : 0.3601334386831149\n", - "train_accuracy : 85.444921875 val_accuracy : 84.03625\n", - "Validation loss decreased (0.361330 --> 0.360133). 
Saving model ...\n", - "==================================================\n" - ] - } - ], - "source": [ - "# Number of training Epochs\n", - "epochs = 5\n", - "\n", - "# Maximum absolute value accepted for the gradeint\n", - "clip = 5\n", - "\n", - "# Initial Loss value (assumed big)\n", - "valid_loss_min = np.Inf\n", - "\n", - "# Lists to follow the evolution of the loss and accuracy\n", - "epoch_tr_loss,epoch_vl_loss = [],[]\n", - "epoch_tr_acc,epoch_vl_acc = [],[]\n", - "\n", - "# Train for a number of Epochs\n", - "for epoch in range(epochs):\n", - " train_losses = []\n", - " train_acc = 0.0\n", - " model.train()\n", - "\n", - " for inputs, labels in train_loader:\n", - "\n", - " # Initialize hidden state\n", - " h = model.init_hidden(batch_size)\n", - " # Creating new variables for the hidden state\n", - " h = tuple([each.data.to(device) for each in h])\n", - "\n", - " # Move batch inputs and labels to gpu\n", - " inputs, labels = inputs.to(device), labels.to(device)\n", - "\n", - " # Set gradient to zero\n", - " model.zero_grad()\n", - "\n", - " # Compute model output\n", - " output,h = model(inputs,h)\n", - "\n", - " # Calculate the loss and perform backprop\n", - " loss = criterion(output.squeeze(), labels.float())\n", - " loss.backward()\n", - " train_losses.append(loss.item())\n", - "\n", - " # calculating accuracy\n", - " accuracy = acc(output,labels)\n", - " train_acc += accuracy\n", - "\n", - " #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.\n", - " nn.utils.clip_grad_norm_(model.parameters(), clip)\n", - " optimizer.step()\n", - "\n", - "\n", - " # Evaluate on the validation set for this epoch\n", - " val_losses = []\n", - " val_acc = 0.0\n", - " model.eval()\n", - " for inputs, labels in valid_loader:\n", - "\n", - " # Initialize hidden state\n", - " val_h = model.init_hidden(batch_size)\n", - " val_h = tuple([each.data.to(device) for each in val_h])\n", - "\n", - " # Move batch inputs and labels to gpu\n", - " 
inputs, labels = inputs.to(device), labels.to(device)\n", - "\n", - " # Compute model output\n", - " output, val_h = model(inputs, val_h)\n", - "\n", - " # Compute Loss\n", - " val_loss = criterion(output.squeeze(), labels.float())\n", - "\n", - " val_losses.append(val_loss.item())\n", - "\n", - " accuracy = acc(output,labels)\n", - " val_acc += accuracy\n", - "\n", - " epoch_train_loss = np.mean(train_losses)\n", - " epoch_val_loss = np.mean(val_losses)\n", - " epoch_train_acc = train_acc/len(train_loader.dataset)\n", - " epoch_val_acc = val_acc/len(valid_loader.dataset)\n", - " epoch_tr_loss.append(epoch_train_loss)\n", - " epoch_vl_loss.append(epoch_val_loss)\n", - " epoch_tr_acc.append(epoch_train_acc)\n", - " epoch_vl_acc.append(epoch_val_acc)\n", - " print(f'Epoch {epoch+1}')\n", - " print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')\n", - " print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')\n", - " if epoch_val_loss <= valid_loss_min:\n", - " print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min,epoch_val_loss))\n", - " # torch.save(model.state_dict(), '../working/state_dict.pt')\n", - " valid_loss_min = epoch_val_loss\n", - " print(25*'==')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "execution": {}, - "colab": { - "base_uri": "https://localhost:8080/", - "height": 364 - }, - "id": "ttJazP-nfjEH", - "outputId": "992bed02-611e-4614-c60f-77223d5b801a" - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "
" - ], - "image/png": "\n" - }, - "metadata": {} - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "execution": {}, + "id": "view-in-github" + }, + "source": [ + "\"Open   \"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "# Twitter Sentiment Analysis\n", + "\n", + "**By Neuromatch Academy**\n", + "\n", + "__Content creators:__ Juan Manuel Rodriguez, Salomey Osei, Gonzalo Uribarri\n", + "\n", + "__Production editors:__ Amita Kapoor, Spiros Chavlis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# Welcome to the NLP project template\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# Step 1: Questions and goals\n", + "\n", + "* Can we infer emotion from a tweet text?\n", + "* How are words distributed across the dataset?\n", + "* Are words related to one kind of emotion?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# Step 2: Literature review\n", + "\n", + "[Original Dataset Paper](https://cs.stanford.edu/people/alecmgo/papers/TwitterDistantSupervision09.pdf)\n", + "\n", + "[Papers with code](https://paperswithcode.com/dataset/imdb-movie-reviews)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# Step 3: Load and explore the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "execution": {} + }, + "outputs": [], + "source": [ + "# @title Install dependencies\n", + "!pip install pandas --quiet\n", + "!pip install torchtext --quiet\n", + "!pip install datasets --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# We import some libraries to load the dataset\n", + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from collections import Counter\n", + "from tqdm.notebook import tqdm\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "import torch.nn.functional as F\n", + "from torch.utils.data import TensorDataset, DataLoader\n", + "\n", + "import torchtext\n", + "from torchtext.data import get_tokenizer\n", + "\n", + "from sklearn.utils import shuffle\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import CountVectorizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "You can find the dataset we are going to use on [this website](https://huggingface.co/datasets/stanfordnlp/sentiment140)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "dataset = load_dataset(\"stanfordnlp/sentiment140\", trust_remote_code= True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
polarityiddatequeryusertext
001467810369Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, t...
101467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
201467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
301467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
401467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
\n", + "
" ], - "source": [ - "fig = plt.figure(figsize = (20, 6))\n", - "plt.subplot(1, 2, 1)\n", - "plt.plot(epoch_tr_acc, label='Train Acc')\n", - "plt.plot(epoch_vl_acc, label='Validation Acc')\n", - "plt.title(\"Accuracy\")\n", - "plt.legend()\n", - "plt.grid()\n", - "\n", - "plt.subplot(1, 2, 2)\n", - "plt.plot(epoch_tr_loss, label='Train loss')\n", - "plt.plot(epoch_vl_loss, label='Validation loss')\n", - "plt.title(\"Loss\")\n", - "plt.legend()\n", - "plt.grid()\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "execution": {}, - "id": "iUyaF-EbfjEH" - }, - "source": [ - "---\n", - "# What's Next?\n", - "\n", - "You can use this project template as a starting point to think about your own project. There are a lot of ways to continue, here we share with you some ideas you migth find useful:\n", - "\n", - "* **Work on the Preproccesing.** We used a very rudimentary way to tokenize tweets. But there are better ways to preprocess the data. Can you think of a suitable way to preprocess the data for this particular task? How does the performance of the model change when the data is processed correctly?\n", - "* **Work on the Model.** The RNN model proposed in this notebook is not optimized at all. You can work on finding a better architecture or better hyperparamenters. May be using bidirectonal LSTMs or increasing the number of stacked layers can improve the performance, feel free to try different approaches.\n", - "* **Work on the Embedding.** Our model learnt an embedding during the training on this Twitter corpus for a particular task. You can explore the representation of different words in this learned embedding. Also, you can try using different word embeddings. You can train them on this corpus or you can use an embedding trained on another corpus of data. 
How does the change of the embedding affect the model performance?\n", - "* **Try sentiment analysis on another dataset.** There are lots of available dataset to work with, we can help you find one that is interesting to you. Do you belive that a sentiment analysis model trained on some corpus (Twitter dataset) will perform well on another type of data (for example, youtube comments)?\n", - "\n" - ] + "text/plain": [ + " polarity ... text\n", + "0 0 ... @switchfoot http://twitpic.com/2y1zl - Awww, t...\n", + "1 0 ... is upset that he can't update his Facebook by ...\n", + "2 0 ... @Kenichan I dived many times for the ball. Man...\n", + "3 0 ... my whole body feels itchy and like its on fire \n", + "4 0 ... @nationwideclass no, it's not behaving at all....\n", + "\n", + "[5 rows x 6 columns]" + ] + }, + "execution_count": 4, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "provenance": [] - }, - "kernel": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.2" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "fbb4191426bd485e8e965b6d432eecae": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_df7eba182d1b4c21bc21d157eac6b996", - 
"IPY_MODEL_6d64402d9da74516ab4e1d46ae9f1ee3", - "IPY_MODEL_d9ca809f7b1c49e595a05458251f3ab2" - ], - "layout": "IPY_MODEL_90908b6f69524a72860214ef8bd2d946" - } - }, - "df7eba182d1b4c21bc21d157eac6b996": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_db432a2cd6244a7592fc9732f0ca4738", - "placeholder": "​", - "style": "IPY_MODEL_84485541f3a14c65a67d10a97b72bbad", - "value": "Downloading builder script: 100%" - } - }, - "6d64402d9da74516ab4e1d46ae9f1ee3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5fa7ab2ab2004e5cb692199e2bd27d6b", - "max": 4033, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_ab71bd2b452146829e973d6cf99f31ed", - "value": 4033 - } - }, - "d9ca809f7b1c49e595a05458251f3ab2": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": 
"HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_55ba92cfe0724286ac1c2bbe6577e5c8", - "placeholder": "​", - "style": "IPY_MODEL_67a4fa49ca5349d58512a16a3742d401", - "value": " 4.03k/4.03k [00:00<00:00, 114kB/s]" - } - }, - "90908b6f69524a72860214ef8bd2d946": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "db432a2cd6244a7592fc9732f0ca4738": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - 
"bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "84485541f3a14c65a67d10a97b72bbad": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "5fa7ab2ab2004e5cb692199e2bd27d6b": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - 
"grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ab71bd2b452146829e973d6cf99f31ed": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "55ba92cfe0724286ac1c2bbe6577e5c8": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, 
- "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "67a4fa49ca5349d58512a16a3742d401": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "afd671543846468abfe37669a72845c3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_057e918ace004506aedc4e4b9942c3a8", - "IPY_MODEL_325387f6b62d47b0b21bea61676cea72", - "IPY_MODEL_ea1e3eb0e6ec4f8d82cf9b12cfe6e700" - ], - "layout": "IPY_MODEL_96c2d7ee644a438982e1792b7ec0453c" - } - }, - "057e918ace004506aedc4e4b9942c3a8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9baa1a735c0646b89953bf4a7c7fc92c", - "placeholder": "​", - "style": "IPY_MODEL_0ac9711c8ece4c5397a8cd810713adfb", - "value": 
"Downloading readme: 100%" - } - }, - "325387f6b62d47b0b21bea61676cea72": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a8d69769921241b8b1081e84f7770858", - "max": 6837, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_d189f24b0e964d1a9fc86379bad38cca", - "value": 6837 - } - }, - "ea1e3eb0e6ec4f8d82cf9b12cfe6e700": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_db9bf44dec914db793cc4f73751c272c", - "placeholder": "​", - "style": "IPY_MODEL_1cf3ba0f756f4aa5ad1dcb675a791cfa", - "value": " 6.84k/6.84k [00:00<00:00, 157kB/s]" - } - }, - "96c2d7ee644a438982e1792b7ec0453c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - 
"flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9baa1a735c0646b89953bf4a7c7fc92c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0ac9711c8ece4c5397a8cd810713adfb": { - "model_module": 
"@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a8d69769921241b8b1081e84f7770858": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d189f24b0e964d1a9fc86379bad38cca": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "db9bf44dec914db793cc4f73751c272c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1cf3ba0f756f4aa5ad1dcb675a791cfa": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "c432c4efcb794ce781fcb6f176f1b60d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": 
"1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_510eeffb32694e7798f23e3931d7a943", - "IPY_MODEL_a8b3dfaa2831416582d8eeef01451386", - "IPY_MODEL_db1cdafaf36f4c339476f3221abc17b3" - ], - "layout": "IPY_MODEL_ffd3778a96e046718828bbc5aa73f173" - } - }, - "510eeffb32694e7798f23e3931d7a943": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_49c5a3fbe87b491cb3f0f450a0af0659", - "placeholder": "​", - "style": "IPY_MODEL_252949e8784c4878a62eb2e30b1e3466", - "value": "Downloading data: 100%" - } - }, - "a8b3dfaa2831416582d8eeef01451386": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7bcef602e7f441308472bc145b12dcd3", - "max": 81363704, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_97fb30a5a31742efa1d188b9361f9938", - "value": 81363704 - } - }, - "db1cdafaf36f4c339476f3221abc17b3": { - 
"model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9b34daddb9cc48bba109e547177ec654", - "placeholder": "​", - "style": "IPY_MODEL_fd2b5a6533794a2794579956c25247fb", - "value": " 81.4M/81.4M [00:06<00:00, 15.3MB/s]" - } - }, - "ffd3778a96e046718828bbc5aa73f173": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "49c5a3fbe87b491cb3f0f450a0af0659": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - 
"model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "252949e8784c4878a62eb2e30b1e3466": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7bcef602e7f441308472bc145b12dcd3": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", 
- "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "97fb30a5a31742efa1d188b9361f9938": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "9b34daddb9cc48bba109e547177ec654": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - 
"grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fd2b5a6533794a2794579956c25247fb": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "f3a9667c8c994324a2409f227bd0a1e9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_6e6c5372ffe045c0b72587989567429e", - "IPY_MODEL_2ead0216695e4227aef44552f4ec3cc9", - "IPY_MODEL_53843f49adda4bce8450fd91fa9fd587" - ], - "layout": "IPY_MODEL_40262cb3eefa45fcbe37aaafccb69f5f" - } - }, - "6e6c5372ffe045c0b72587989567429e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - 
"_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b54b826314ea4b3a92eebd218c093fc1", - "placeholder": "​", - "style": "IPY_MODEL_8cd7be688b8c4818be48915db14a0792", - "value": "Generating train split: 100%" - } - }, - "2ead0216695e4227aef44552f4ec3cc9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a9a0f6ce71ed415c8c8513f68e34e162", - "max": 1600000, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_7f638a6deacd42e88c031fa47797516b", - "value": 1600000 - } - }, - "53843f49adda4bce8450fd91fa9fd587": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_849e39cc86f64e558ff94bf542a5121a", - "placeholder": "​", - "style": "IPY_MODEL_67b0b03c391c414bad5ea9fb3c947a2f", - "value": " 1600000/1600000 [01:18<00:00, 14710.70 examples/s]" - } - }, - "40262cb3eefa45fcbe37aaafccb69f5f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - 
"_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b54b826314ea4b3a92eebd218c093fc1": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - 
"margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8cd7be688b8c4818be48915db14a0792": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a9a0f6ce71ed415c8c8513f68e34e162": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - 
"visibility": null, - "width": null - } - }, - "7f638a6deacd42e88c031fa47797516b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "849e39cc86f64e558ff94bf542a5121a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "67b0b03c391c414bad5ea9fb3c947a2f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - 
"_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "1cef38981af6457dbaeb393f9936a389": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_b0b5cfae51214c60bbca9a09b196c217", - "IPY_MODEL_5ee2a4b33be04c6db8ee4d7995c2376d", - "IPY_MODEL_403fffb635c2409ebeabc90063750ed3" - ], - "layout": "IPY_MODEL_6279343019064572adedf34cfbd437fa" - } - }, - "b0b5cfae51214c60bbca9a09b196c217": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2715d00db77545f9aa5eace8a0eb2839", - "placeholder": "​", - "style": "IPY_MODEL_942ce490d87347c789e229589b1b9c9f", - "value": "Generating test split: 100%" - } - }, - "5ee2a4b33be04c6db8ee4d7995c2376d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - 
"_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f04df4daeb6049ab85d3d75b472ccf6e", - "max": 498, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_fd0b3c53b66543cea0c396d8047445a8", - "value": 498 - } - }, - "403fffb635c2409ebeabc90063750ed3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2c42e2fef6314c9e842a7e9641af3cab", - "placeholder": "​", - "style": "IPY_MODEL_913d95e58aa94e4a8009768a23fbf304", - "value": " 498/498 [00:00<00:00, 7393.07 examples/s]" - } - }, - "6279343019064572adedf34cfbd437fa": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, 
- "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2715d00db77545f9aa5eace8a0eb2839": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "942ce490d87347c789e229589b1b9c9f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - 
"f04df4daeb6049ab85d3d75b472ccf6e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fd0b3c53b66543cea0c396d8047445a8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "2c42e2fef6314c9e842a7e9641af3cab": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "913d95e58aa94e4a8009768a23fbf304": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e1348a02ceeb4af19fbd63d52b7d843b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_fbf51b14e6b34d0485ddf59c43d22c49", - "IPY_MODEL_c29e06a72ac9401b8c41f4195021071e", 
- "IPY_MODEL_48b812211db04284bfbbf02823fb879a" - ], - "layout": "IPY_MODEL_5455119809c74916acc50e1905903ded" - } - }, - "fbf51b14e6b34d0485ddf59c43d22c49": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2475bd62a3224bacb38a6334d07d6a8c", - "placeholder": "​", - "style": "IPY_MODEL_3d29947b5d2d4e2abc1355d900096642", - "value": "100%" - } - }, - "c29e06a72ac9401b8c41f4195021071e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3f7a8f56f15c434da70029366a37167a", - "max": 1280000, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_3610a2db297f4686bf9043f2b7ee55b5", - "value": 1280000 - } - }, - "48b812211db04284bfbbf02823fb879a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - 
"layout": "IPY_MODEL_a1bd0616199e44538977ee2ea6049690", - "placeholder": "​", - "style": "IPY_MODEL_835fb9a91b34471fa6d61adf37616f52", - "value": " 1280000/1280000 [00:22<00:00, 77416.28it/s]" - } - }, - "5455119809c74916acc50e1905903ded": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2475bd62a3224bacb38a6334d07d6a8c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - 
"flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3d29947b5d2d4e2abc1355d900096642": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3f7a8f56f15c434da70029366a37167a": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - 
"height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3610a2db297f4686bf9043f2b7ee55b5": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "a1bd0616199e44538977ee2ea6049690": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, 
- "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "835fb9a91b34471fa6d61adf37616f52": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "d8de1a85076b453a92295e79110ba8fd": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_78d48ee2fb9f42089f475fcf5fc368c8", - "IPY_MODEL_b0ca3012d0b84c5a9d7c1fc176251af7", - "IPY_MODEL_39fa73efcbf54d8dad225d8380061dbf" - ], - "layout": "IPY_MODEL_6b6cc35257fe433e93736d02e898b6b8" - } - }, - "78d48ee2fb9f42089f475fcf5fc368c8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e0fc900d8b5940a6bd6a97e58adb4651", - "placeholder": "​", - "style": "IPY_MODEL_6b7286d74e0f4a0199dbfcaf3dd0d622", - "value": "100%" - } - }, - "b0ca3012d0b84c5a9d7c1fc176251af7": { - "model_module": 
"@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a4bbd3df99cd4acab5e1b3ba5cd7c114", - "max": 320000, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_9a7140a6197945d5bac5c48b820dfb04", - "value": 320000 - } - }, - "39fa73efcbf54d8dad225d8380061dbf": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0bdc146792a64853ae06a9d185aa2b15", - "placeholder": "​", - "style": "IPY_MODEL_768da964ffcd44fea1af09e81f5621f3", - "value": " 320000/320000 [00:06<00:00, 58691.43it/s]" - } - }, - "6b6cc35257fe433e93736d02e898b6b8": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - 
"grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e0fc900d8b5940a6bd6a97e58adb4651": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6b7286d74e0f4a0199dbfcaf3dd0d622": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - 
"model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a4bbd3df99cd4acab5e1b3ba5cd7c114": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9a7140a6197945d5bac5c48b820dfb04": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": 
"StyleView", - "bar_color": null, - "description_width": "" - } - }, - "0bdc146792a64853ae06a9d185aa2b15": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "768da964ffcd44fea1af09e81f5621f3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } + ], + "source": [ + "# We load the dataset\n", + "train_data = dataset[\"train\"]\n", + "df = pd.DataFrame(train_data)\n", + "df = df.rename(columns={'sentiment': 'polarity'})\n", + "df = df[['polarity', 'user', 
'date', 'query', 'user', 'text']]\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "For this project we will use only the text and the polarity of the tweet. Notice that polarity is 0 for negative tweets and 4 for positive tweets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "X = df.text.values\n", + "\n", + "# Changes values from [0,4] to [0,1]\n", + "y = (df.polarity.values > 1).astype(int)\n", + "\n", + "\n", + "# Split the data into train and test\n", + "x_train_text, x_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "The first thing we have to do before working on the models is to familiarize ourselves with the dataset. This is called Exploratory Data Analysis (EDA)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1: @paisleypaisley LOL why do i get ideas so far in advance? it's not even june yet! we need a third knitter to have our own summer group \n", + "0: worst headache ever \n", + "0: @ewaniesciuszko i am so sad i wont see you! I miss you already. and yeah! that's perfect; i come back the 18th!\n", + "1: doesn't know how to spell conked \n", + "0: "So we stand here now and no one knows us at all I won't get used to this I won't get used to being gone"...I miss home and everyone -a\n" + ] + } + ], + "source": [ + "for s, l in zip(x_train_text[:5], y_train[:5]):\n", + " print('{}: {}'.format(l, s))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "An interesting thing to analyze is the Word Distribution. 
In order to count the occurrences of each word, we should tokenize the sentences first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before Tokenize: worst headache ever \n", + "After Tokenize: ['worst', 'headache', 'ever']\n" + ] + } + ], + "source": [ + "tokenizer = get_tokenizer(\"basic_english\")\n", + "\n", + "print('Before Tokenize: ', x_train_text[1])\n", + "print('After Tokenize: ', tokenizer(x_train_text[1]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "499e7fb54aa048afb3cba78dd8d6bb0e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=1280000.0), HTML(value='')))" + ] + }, + "metadata": { + "tags": [] + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fff9bd0ae74e46b0ad97ad980a834a58", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, max=320000.0), HTML(value='')))" + ] + }, + "metadata": { + "tags": [] + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "x_train_token = [tokenizer(s) for s in tqdm(x_train_text)]\n", + "x_test_token = [tokenizer(s) for s in tqdm(x_test_text)]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "We can count the word occurrences and see how many different words are present in our dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of different Tokens in our Dataset: 669284\n", + "['.', 'i', '!', \"'\", 'to', 'the', ',', 'a', 'my', 'it', 'and', 'you', '?', 'is', 'for', 'in', 's', 'of', 't', 'on', 'that', 'me', 'so', 'have', 'm', 'but', 'just', 'with', 'be', 'at', 'not', 'was', 'this', 'now', 'can', 'good', 'up', 'day', 'all', 'get', 'out', 'like', 'are', 'no', 'go', 'http', '-', 'today', 'do', 'too', 'your', 'work', 'going', 'love', 'we', 'got', 'what', 'lol', 'time', 'back', 'from', 'u', 'one', 'will', 'know', 'about', 'im', 'really', 'don', 'am', 'had', ')', 'see', 'some', 'there', 'its', '&', 'how', 'if', 'still', 'they', '"', 'night', '(', 'well', 'want', 'new', 'think', '2', 'home', 'thanks', 'll', 'oh', 'when', 'as', 'he', 'more', 'here', 'much', 'off']\n" + ] + } + ], + "source": [ + "words = Counter()\n", + "for s in x_train_token:\n", + " for w in s:\n", + " words[w] += 1\n", + "\n", + "sorted_words = list(words.keys())\n", + "sorted_words.sort(key=lambda w: words[w], reverse=True)\n", + "print(f\"Number of different Tokens in our Dataset: {len(sorted_words)}\")\n", + "print(sorted_words[:100])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Now we can plot their distribution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The 0.13970153178620734% most common words account for the 80.00532743602652% of the occurrences\n" + ] + } + ], + "source": [ + "count_occurences = sum(words.values())\n", + "\n", + "accumulated = 0\n", + "counter = 0\n", + "\n", + "while accumulated < count_occurences * 0.8:\n", + " accumulated += words[sorted_words[counter]]\n", + " counter += 1\n", + "\n", + "print(f\"The {counter * 100 / len(words)}% most common words \"\n", + " f\"account for the {accumulated * 100 / count_occurences}% of the occurrences\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEDCAYAAAAlRP8qAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAARvUlEQVR4nO3dbZBeZ13H8e/PpkUBpWhWR5NqoqZoRBBcSxVHIuCYtk6jIzqNKOIU8sYiKj7EUYvWNyCOTzMFjFgrjLYWZDBDI1WxWkdt7VawNA2toVS6Fc3SFlQcLRn+vrhPmJvt7t5nk7O5d6/9fmZ29jxcPed/5kp/99nrPNypKiRJG9/nTLsASdIwDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEZMNdCTXJfkRJJ7erb//iT3Jjma5I/Wuj5J2kgyzfvQk3wb8N/AW6vqmRPa7gJuAl5YVY8l+eKqOnE26pSkjWCqZ+hVdRvw6PiyJF+V5D1J7kryt0m+plv1SuDaqnqs+28Nc0kasx7H0A8Br6qqbwR+Cnhjt/xC4MIkf5fk9iR7p1ahJK1DW6ZdwLgkTwW+BXh7klOLn9T93gLsAvYA24Hbknx9VX38bNcpSevRugp0Rn8xfLyqvmGJdfPAHVX1KeDDSe5nFPB3ns0CJWm9WldDLlX1n4zC+vsAMvLsbvW7GJ2dk2QroyGYB6ZRpyStR9O+bfEG4B+AZySZT3Il8FLgyiT/DBwF9nXNbwEeSXIvcCvw01X1yDTqlqT1aKq3LUqShrOuhlwkSadvahdFt27dWjt27JjW7iVpQ7rrrrs+VlUzS62bWqDv2LGDubm5ae1ekjakJP+63DqHXCSpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqRHr7X3ovew4ePNnph983WVTrESS1g/P0CWpEQa6JDXCQJekRhjoktSIiYGe5LokJ5Lcs0KbPUnen+Rokr8ZtkRJUh99ztCvB/YutzLJ+cAbgcur6uuA7xumNEnSakwM9Kq6DXh0hSY/ALyzqj7StT8xUG2SpFUYYgz9QuDpSf46yV1JXrZcwyQHkswlmVtYWBhg15KkU4YI9C3ANwKXAd8J/GKSC5dqWFWH
qmq2qmZnZpb8SjxJ0mka4knReeCRqvok8MkktwHPBu4fYNuSpJ6GOEP/U+Bbk2xJ8mTgecCxAbYrSVqFiWfoSW4A9gBbk8wDrwXOBaiqN1fVsSTvAe4GPg28paqWvcVRkrQ2JgZ6Ve3v0eYNwBsGqUiSdFp8UlSSGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1IiJgZ7kuiQnkqz4LURJvinJySQvGa48SVJffc7Qrwf2rtQgyTnA64E/H6AmSdJpmBjoVXUb8OiEZq8C/gQ4MURRkqTVO+Mx9CTbgO8B3tSj7YEkc0nmFhYWznTXkqQxQ1wU/U3gZ6vq05MaVtWhqpqtqtmZmZkBdi1JOmXLANuYBW5MArAVuDTJyap61wDbliT1dMaBXlU7T00nuR54t2EuSWffxEBPcgOwB9iaZB54LXAuQFW9eU2rkyT1NjHQq2p/341V1cvPqBpJ0mnzSVFJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMmBnqS65KcSHLPMutfmuTuJB9I8vdJnj18mZKkSfqcoV8P7F1h/YeBF1TV1wO/AhwaoC5J0ir1+Qq625LsWGH934/N3g5sP/OyJEmrNfQY+pXAny23MsmBJHNJ5hYWFgbetSRtboMFepJvZxToP7tcm6o6VFWzVTU7MzMz1K4lSfQYcukjybOAtwCXVNUjQ2xTkrQ6Z3yGnuTLgXcCP1RV9595SZKk0zHxDD3JDcAeYGuSeeC1wLkAVfVm4Grgi4A3JgE4WVWza1WwJGlpfe5y2T9h/SuAVwxWkSTptPikqCQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpERMDPcl1SU4kuWeZ9Uny20mOJ7k7yXOHL1OSNEmfM/Trgb0rrL8E2NX9HADedOZlSZJWa2KgV9VtwKMrNNkHvLVGbgfOT/KlQxUoSepniDH0bcBDY/Pz3bInSHIgyVySuYWFhQF2LUk65axeFK2qQ1U1W1WzMzMzZ3PXktS8IQL9YeCCsfnt3TJJ0lk0RKAfBl7W3e1yMfCJqvroANuVJK3ClkkNktwA7AG2JpkHXgucC1BVbwaOAJcCx4H/AX5krYqVJC1vYqBX1f4J6wv40cEqkiSdFp8UlaRGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqRG9Aj3J3iT3JTme5OAS6788ya1J3pfk7iSXDl+qJGklEwM9yTnAtcAlwG5gf5Ldi5r9AnBTVT0HuAJ449CFSpJW1ucM/SLgeFU9UFWPAzcC+xa1KeALuumnAf82XImSpD4mfgUdsA14aGx+Hnjeoja/BPx5klcBTwFePEh1kqTehroouh+4vqq2M/rC6LclecK2kxxIMpdkbmFhYaBdS5KgX6A/DFwwNr+9WzbuSuAmgKr6B+Bzga2LN1RVh6pqtqpmZ2ZmTq9iSdKS+gT6ncCuJDuTnMfooufhRW0+ArwIIMnXMgp0T8El6SyaGOhVdRK4CrgFOMbobpajSa5JcnnX7DXAK5P8M3AD8PKqqrUqWpL0RH0uilJVR4Aji5ZdPTZ9L/D8YUvrZ8fBmz8z/eDrLptGCZK0LvikqCQ1wkCXpEYY6JLUiOYCfcfBmz9rXF2SNovmAl2SNisDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJakSvQE+yN8l9SY4nObhMm+9Pcm+So0n+aNgyJUmTTPzGoiTnANcC3wHMA3cmOdx9S9GpNruAnwOe
X1WPJfnitSpYkrS0PmfoFwHHq+qBqnocuBHYt6jNK4Frq+oxgKo6MWyZkqRJ+gT6NuChsfn5btm4C4ELk/xdktuT7F1qQ0kOJJlLMrewsHB6FUuSljTURdEtwC5gD7Af+N0k5y9uVFWHqmq2qmZnZmYG2rUkCfoF+sPABWPz27tl4+aBw1X1qar6MHA/o4CXJJ0lfQL9TmBXkp1JzgOuAA4vavMuRmfnJNnKaAjmgQHrlCRNMDHQq+okcBVwC3AMuKmqjia5JsnlXbNbgEeS3AvcCvx0VT2yVkVLkp5o4m2LAFV1BDiyaNnVY9MF/GT3I0maAp8UlaRGGOiS1AgDXZIaYaBLUiMMdElqRNOBvuPgzew4ePO0y5Cks6LpQJekzcRAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY3YNIHuPemSWrdpAl2SWmegS1IjegV6kr1J7ktyPMnBFdp9b5JKMjtciZKkPiYGepJzgGuBS4DdwP4ku5do9/nAq4E7hi5yaI6nS2pRnzP0i4DjVfVAVT0O3AjsW6LdrwCvB/53wPokST31CfRtwENj8/Pdss9I8lzggqpa8bQ3yYEkc0nmFhYWVl2sJGl5Z3xRNMnnAL8OvGZS26o6VFWzVTU7MzNzprsehMMvklrRJ9AfBi4Ym9/eLTvl84FnAn+d5EHgYuCwF0Yl6ezqE+h3AruS7ExyHnAFcPjUyqr6RFVtraodVbUDuB24vKrm1qRiSdKSJgZ6VZ0ErgJuAY4BN1XV0STXJLl8rQuUJPWzpU+jqjoCHFm07Opl2u4587IkSavlk6KS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQb6GF8DIGkjM9AlqREGuiQ1wkCXpEYY6JLUCANdkhphoC/DO14kbTQGuiQ1wkDvYfxsfblpSZo2A12SGtEr0JPsTXJfkuNJDi6x/ieT3Jvk7iTvTfIVw5cqSVrJxEBPcg5wLXAJsBvYn2T3ombvA2ar6lnAO4BfHbpQSdLK+pyhXwQcr6oHqupx4EZg33iDqrq1qv6nm70d2D5smZKkSfoE+jbgobH5+W7Zcq4E/mypFUkOJJlLMrewsNC/yg3AC6SSpq3Xl0T3leQHgVngBUutr6pDwCGA2dnZGnLf68l4sD/4usumWImkzaRPoD8MXDA2v71b9lmSvBj4eeAFVfV/w5QnSeqrT6DfCexKspNRkF8B/MB4gyTPAX4H2FtVJwavcgNbbhjGM3dJQ5s4hl5VJ4GrgFuAY8BNVXU0yTVJLu+avQF4KvD2JO9PcnjNKpYkLanXGHpVHQGOLFp29dj0iweuS5K0SoNeFFV/XjiVNDQDfR1wnF3SEAz0dcygl7QaBvoG5HCNpKX4tsUG+EpfSWCgN81wlzYXA32T8Cxeap+BvskZ7lI7DHR9Rp+v2vMDQFq/DHSdNsNdWl+8bVGDOBXsD77uslWHvLdeSsMw0DV1p/MBsNQHiB8M2uwcclFTVnsdYKhpaT3wDF0awFoHu399qA8DXdoglhtmmvb0mfLDajgGuqSp6vPBsB4+uDbCh1ivMfQke5Pcl+R4koNLrH9Skj/u1t+RZMfQhUqSVjYx0JOcA1wLXALsBvYn2b2o2ZXAY1X11cBvAK8fulBJ0sr6nKFfBByvqgeq6nHgRmDfojb7gD/opt8BvChJhitTkjRJqmrlBslLgL1V9Ypu/oeA51XVVWNt7unazHfzH+rafGzRtg4AB7rZZwD3nWH9W4GPTWzVFo95c/CYN4fTOeavqKqZpVac1YuiVXUIODTU9pLMVdXsUNvbCDzmzcFj3hyGPuY+Qy4PAxeMzW/vli3ZJskW4GnAI0MUKEnqp0+g3wnsSrIzyXnAFcDhRW0OAz/cTb8E+KuaNJYjSRrUxCGXqjqZ5CrgFuAc4LqqOprkGmCuqg4Dvwe8Lclx4FFGoX82DDZ8s4F4zJuDx7w5DHrMEy+KSpI2Bl/OJUmNMNAlqREbMtAnvYqgBUku
SHJrknuTHE3y6m75Fyb5iyT/0v1++rRrHVqSc5K8L8m7u/md3SsljnevmDhv2jUOKcn5Sd6R5INJjiX55tb7OclPdP+u70lyQ5LPba2fk1yX5ET3nM6pZUv2a0Z+uzv2u5M893T2ueECveerCFpwEnhNVe0GLgZ+tDvOg8B7q2oX8N5uvjWvBo6Nzb8e+I3u1RKPMXrVREt+C3hPVX0N8GxGx95sPyfZBvwYMFtVz2R0s8UVtNfP1wN7Fy1brl8vAXZ1PweAN53ODjdcoNPvVQQbXlV9tKr+qZv+L0b/k2/js1+z8AfAd0+nwrWRZDtwGfCWbj7ACxm9UgIaO+YkTwO+jdGdYlTV41X1cRrvZ0Z32H1e99zKk4GP0lg/V9VtjO76G7dcv+4D3lojtwPnJ/nS1e5zIwb6NuChsfn5blmzurdXPge4A/iSqvpot+rfgS+ZUllr5TeBnwE+3c1/EfDxqjrZzbfW3zuBBeD3u2GmtyR5Cg33c1U9DPwa8BFGQf4J4C7a7udTluvXQXJtIwb6ppLkqcCfAD9eVf85vq57eKuZ+06TfBdwoqrumnYtZ9EW4LnAm6rqOcAnWTS80mA/P53RGelO4MuAp/DEoYnmrUW/bsRA7/MqgiYkOZdRmP9hVb2zW/wfp/4U636fmFZ9a+D5wOVJHmQ0lPZCRuPL53d/mkN7/T0PzFfVHd38OxgFfMv9/GLgw1W1UFWfAt7JqO9b7udTluvXQXJtIwZ6n1cRbHjd2PHvAceq6tfHVo2/ZuGHgT8927Wtlar6uaraXlU7GPXrX1XVS4FbGb1SAto75n8HHkryjG7Ri4B7abifGQ21XJzkyd2/81PH3Gw/j1muXw8DL+vudrkY+MTY0Ex/VbXhfoBLgfuBDwE/P+161ugYv5XRn2N3A+/vfi5lNKb8XuBfgL8EvnData7R8e8B3t1NfyXwj8Bx4O3Ak6Zd38DH+g3AXNfX7wKe3no/A78MfBC4B3gb8KTW+hm4gdE1gk8x+kvsyuX6FQiju/c+BHyA0R1Aq96nj/5LUiM24pCLJGkJBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqxP8D+q4d+O9Hiz8AAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light", + "tags": [] + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.bar(range(100), [words[w] for w in sorted_words[:100]])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "It is very common to find this kind of distribution when analyzing a corpus of text. This is referred to as [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Usually the number of words in the dictionary will be very large. \n", + "\n", + "Here are some things we can do to reduce that number:\n", + "\n", + "* Remove punctuation.\n", + "* Remove stop-words.\n", + "* Stemming.\n", + "* Remove very uncommon words (the words that appear fewer than N times).\n", + "* Nothing: we can use a pretrained model that handles this kind of situation.\n", + "\n", + "\n", + "We used one of the simplest tokenizers available. This tokenizer does not take into account many quirks of the language. Moreover, different languages have different quirks, so there is no \"universal\" tokenizer. There are many libraries that have \"better\" tokenizers:\n", + "\n", + "* [Spacy](https://spacy.io/): it can be accessed using: `get_tokenizer(\"spacy\")`. Spacy supports a wide range of languages.\n", + "* [Huggingface](https://huggingface.co/): it has many tokenizers for different languages. [Doc](https://huggingface.co/transformers/main_classes/tokenizer.html)\n", + "* [NLTK](https://www.nltk.org/): it provides several tokenizers. One of them can be accessed using: `get_tokenizer(\"toktok\")`\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# Step 4: choose toolkit\n", + "\n", + "Our goal is to train a model capable of estimating the sentiment of a tweet (positive or negative) by reading its content. 
To that end we will try 2 different approaches:\n", + "\n", + "* A logistic regression using sklearn. **NOTE**: it can probably work better than an SVM model.\n", + "* A simple Embedding + RNN." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "## Logistic regression\n", + "\n", + "We will represent our sentences using binary vectorization. This means that our data would be represented as a matrix of instances by word with a one if the word is in the instance, and zero otherwise. Sklearn vectorizers can also do things such as stop-word removal and punctuation removal; you can read more about it in [the documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "vectorizer = CountVectorizer(binary=True)\n", + "x_train_cv = vectorizer.fit_transform(x_train_text)\n", + "x_test_cv = vectorizer.transform(x_test_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before Vectorize: doesn't know how to spell conked \n" + ] + } + ], + "source": [ + "print('Before Vectorize: ', x_train_text[3])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "After Vectorize: \n", + " (0, 528584)\t1\n", + " (0, 165468)\t1\n", + " (0, 300381)\t1\n", + " (0, 242211)\t1\n", + " (0, 489893)\t1\n", + " (0, 134160)\t1\n" + ] + } + ], + "source": [ + "# Notice that the matrix is sparse\n", + "print('After Vectorize: ')\n", + "print(x_train_cv[3])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Now we can train our model. 
You can check the documentation of this logistic regressor [here](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html?highlight=logistic#sklearn.linear_model.LogisticRegression)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, l1_ratio=None, max_iter=100,\n", + " multi_class='auto', n_jobs=None, penalty='l2',\n", + " random_state=None, solver='saga', tol=0.0001, verbose=0,\n", + " warm_start=False)" + ] + }, + "execution_count": 15, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "model = LogisticRegression(solver='saga')\n", + "model.fit(x_train_cv, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.81 0.79 0.80 160000\n", + " 1 0.79 0.81 0.80 160000\n", + "\n", + " accuracy 0.80 320000\n", + " macro avg 0.80 0.80 0.80 320000\n", + "weighted avg 0.80 0.80 0.80 320000\n", + "\n" + ] + } + ], + "source": [ + "y_pred = model.predict(x_test_cv)\n", + "\n", + "print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "## Explainable AI\n", + "The best thing about logistic regresion is that it is simple, and we can get some explanations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1, 589260)\n", + "589260\n" + ] + } + ], + "source": [ + "print(model.coef_.shape)\n", + "print(len(vectorizer.vocabulary_))\n", + "\n", + "words_sk = list(vectorizer.vocabulary_.keys())\n", + "words_sk.sort(key=lambda w: model.coef_[0, vectorizer.vocabulary_[w]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "roni: -3.862597673594883\n", + "inaperfectworld: -3.5734362290886375\n", + "dontyouhate: -3.500197620227523\n", + "xbllygbsn: -3.412645372640648\n", + "anqju: -3.336405291553548\n", + "sad: -3.200522312464158\n", + "pakcricket: -3.1949158120163412\n", + "condolences: -3.132498019366488\n", + "heartbreaking: -3.066508733796654\n", + "saddest: -3.041999809733714\n", + "sadd: -3.029070563580306\n", + "heartbroken: -3.0287688233900174\n", + "boohoo: -3.022608649696793\n", + "sadface: -2.9918411285807234\n", + "rachelle_lefevr: -2.925057253107806\n", + "disappointing: -2.902524113779547\n", + "lvbu: -2.894705935001672\n", + "saddens: -2.8855127179984654\n", + "bummed: -2.83650014970307\n", + "neda: -2.792944556837498\n" + ] + } + ], + "source": [ + "for w in words_sk[:20]:\n", + " print('{}: {}'.format(w, model.coef_[0, vectorizer.vocabulary_[w]]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "iamsoannoyed: 2.8494314732277672\n", + "myfax: 2.797451563471618\n", + "jennamadison: 2.5667257393706113\n", + "yeyy: 2.478028598852801\n", + "tryout: 2.4383315790116677\n", + "goldymom: 2.4374026022205535\n", + "wooohooo: 2.40297322137544\n", + "thesupergirl: 2.3565118467330004\n", + "iammaxathotspot: 2.311648368632618\n", 
+ "londicreations: 2.3074490293400993\n", + "smilin: 2.2991891636718216\n", + "worries: 2.2899429774914717\n", + "sinfulsignorita: 2.2798963640981817\n", + "finchensnail: 2.264302079155878\n", + "smackthis: 2.2376679263761083\n", + "kv: 2.2158393907798413\n", + "tojosan: 2.211784259253832\n", + "russmarshalek: 2.2095374025599384\n", + "traciknoppe: 2.1768297770350835\n", + "congratulations: 2.171590496227557\n" + ] + } + ], + "source": [ + "for w in reversed(words_sk[-20:]):\n", + " print('{}: {}'.format(w, model.coef_[0, vectorizer.vocabulary_[w]]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "What does this mean?\n", + "\n", + "Remember that `model.coef_` is the $W$ in:\n", + "\n", + "$$h(x)=\\sigma(Wx + b)$$\n", + "\n", + "where the label 1 is a positive tweet and the label 0 is a negative tweet. A word with a large positive coefficient therefore pushes the prediction towards a positive tweet, and a word with a large negative coefficient pushes it towards a negative tweet." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "## Recurrent Neural Network with Pytorch" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "In the previous section we used a Bag-of-Words approach to represent each of the tweets. That means that we only considered whether each of the words appears in each of the tweets; we didn't take into account the order of the words. 
But we know that the word order is very important and carries relevant information.\n", + "\n", + "In this section we will solve the same task, but this time we will implement a Recurrent Neural Network (RNN) instead of using a simple Logistic Regression. Unlike feedforward neural networks, RNNs have cyclic connections making them powerful for modeling sequences.\n", + "\n", + "Let's start by selecting the device (GPU or CPU) on which the model will run.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "def set_device():\n", + " device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + " if device != \"cuda\":\n", + " print(\"WARNING: For this notebook to perform best, \"\n", + " \"if possible, in the menu under `Runtime` -> \"\n", + " \"`Change runtime type.` select `GPU` \")\n", + " else:\n", + " print(\"GPU is enabled in this notebook.\")\n", + "\n", + " return device" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPU is enabled in this notebook.\n" + ] + } + ], + "source": [ + "# Set the device (check if gpu is available)\n", + "device = set_device()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "First we will create a Dictionary (`word_to_idx`). This dictionary will map each Token (usually words) to an index (an integer number). We want to limit our dictionary to a certain number of tokens (`num_words_dict`), so we will include in our dictionary those with the most occurrences." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['.', 'i', '!', \"'\", 'to', 'the', ',', 'a', 'my', 'it']" + ] + }, + "execution_count": 22, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "# From previous section, we have a list with the most used tokens\n", + "sorted_words[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Let's select only the most used." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "num_words_dict = 30000\n", + "# We reserve two numbers for special tokens.\n", + "most_used_words = sorted_words[:num_words_dict-2]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "We will add two extra Tokens to the dictionary, one for words outside the dictionary (`'UNK'`) and one for padding the sequences (`'PAD'`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# dictionary to go from words to idx\n", + "word_to_idx = {}\n", + "# dictionary to go from idx to words (just in case)\n", + "idx_to_word = {}\n", + "\n", + "\n", + "# We include the special tokens first\n", + "PAD_token = 0\n", + "UNK_token = 1\n", + "\n", + "word_to_idx['PAD'] = PAD_token\n", + "word_to_idx['UNK'] = UNK_token\n", + "\n", + "idx_to_word[PAD_token] = 'PAD'\n", + "idx_to_word[UNK_token] = 'UNK'\n", + "\n", + "# We populate our dictionaries with the most used words\n", + "for num,word in enumerate(most_used_words):\n", + " word_to_idx[word] = num + 2\n", + " idx_to_word[num+2] = word" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Our goal now is to transform each tweet from a sequence of tokens to a sequence of indexes. These sequences of indexes will be the input to our pytorch model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# A function to convert list of tokens to list of indexes\n", + "def tokens_to_idx(sentences_tokens,word_to_idx):\n", + " sentences_idx = []\n", + " for sent in sentences_tokens:\n", + " sent_idx = []\n", + " for word in sent:\n", + " if word in word_to_idx:\n", + " sent_idx.append(word_to_idx[word])\n", + " else:\n", + " sent_idx.append(word_to_idx['UNK'])\n", + " sentences_idx.append(sent_idx)\n", + " return sentences_idx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "x_train_idx = tokens_to_idx(x_train_token,word_to_idx)\n", + "x_test_idx = tokens_to_idx(x_test_token,word_to_idx)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"Before converting: ['worst', 'headache', 'ever']\n", + "After converting: [721, 458, 237]\n" + ] + } + ], + "source": [ + "some_number = 1\n", + "print('Before converting: ', x_train_token[some_number])\n", + "print('After converting: ', x_train_idx[some_number])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "We need all the sequences to have the same length. To select an adequate sequence length, let's explore some statistics about the length of the tweets:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Max tweet word length: 229\n", + "Mean tweet word length: 15.0\n", + "99% percent under: 37.0\n" + ] + } + ], + "source": [ + "tweet_lens = np.asarray([len(sentence) for sentence in x_train_idx])\n", + "print('Max tweet word length: ',tweet_lens.max())\n", + "print('Mean tweet word length: ',np.median(tweet_lens))\n", + "print('99% percent under: ',np.quantile(tweet_lens,0.99))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "We cut the sequences that are longer than our chosen maximum length (`max_length`), keeping their last tokens, and pad the shorter ones with zeros on the left." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + " # We choose the max length\n", + " max_length = 40\n", + "\n", + "# A function to make all the sequences have the same length\n", + "# Note that the output is a Numpy matrix\n", + " def padding(sentences, seq_len):\n", + " features = np.zeros((len(sentences), seq_len),dtype=int)\n", + " for ii, tweet in enumerate(sentences):\n", + " len_tweet = len(tweet)\n", + " if len_tweet != 0:\n", + " if len_tweet <= seq_len:\n", + " # If it's shorter, we fill with zeros (the padding Token index)\n", + " features[ii, -len(tweet):] = np.array(tweet)[:seq_len]\n", + " if len_tweet > seq_len:\n", + " # If it's longer, we take the last 'seq_len' indexes\n", + " features[ii, :] = np.array(tweet)[-seq_len:]\n", + " return features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# We convert our list of tokens into a numpy matrix\n", + "# where all instances have the same length\n", + "x_train_pad = padding(x_train_idx,max_length)\n", + "x_test_pad = padding(x_test_idx,max_length)\n", + "\n", + "# We convert our target list to a numpy array\n", + "y_train_np = np.asarray(y_train)\n", + "y_test_np = np.asarray(y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before padding: [1, 3, 71, 24, 122, 3, 533, 74, 13, 4, 3, 102, 13, 209, 2, 12, 150, 4, 22, 5, 18, 667, 3, 138, 61, 7, 3296, 4]\n", + "After padding: [ 0 0 0 0 0 0 0 0 0 0 0 0 1 3\n", + " 71 24 122 3 533 74 13 4 3 102 13 209 2 12\n", + " 150 4 22 5 18 667 3 138 61 7 3296 4]\n" + ] + } + ], + "source": [ + "some_number = 2\n", + "print('Before padding: ', x_train_idx[some_number])\n", + "print('After padding: ', x_train_pad[some_number])" + ] + }, + { + "cell_type": 
"markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Now, let's convert the data to pytorch format.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# create Tensor datasets\n", + "train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train_np))\n", + "valid_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test_np))\n", + "\n", + "# Batch size (this is an important hyperparameter)\n", + "batch_size = 100\n", + "\n", + "# dataloaders\n", + "# make sure to SHUFFLE your data\n", + "train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,drop_last = True)\n", + "valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size,drop_last = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Each batch of data in our training process will have the following format:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sample input size: torch.Size([100, 40])\n", + "Sample input: \n", + " tensor([[ 0, 0, 0, ..., 4, 4, 4],\n", + " [ 0, 0, 0, ..., 7447, 14027, 2],\n", + " [ 0, 0, 0, ..., 100, 22241, 4],\n", + " ...,\n", + " [ 0, 0, 0, ..., 2702, 4409, 2],\n", + " [ 0, 0, 0, ..., 162, 17, 1],\n", + " [ 0, 0, 0, ..., 67, 12904, 49]])\n", + "Sample input: \n", + " tensor([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,\n", + " 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,\n", + " 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,\n", + " 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,\n", + " 0, 0, 1, 0])\n" + ] + } + ], + "source": [ + "# Obtain one batch of training data\n", + "dataiter = iter(train_loader)\n", + "sample_x, 
sample_y = dataiter.__next__()\n", + "\n", + "print('Sample input size: ', sample_x.size()) # batch_size, seq_length\n", + "print('Sample input: \\n', sample_x)\n", + "print('Sample input: \\n', sample_y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "Now, we will define the `SentimentRNN` class. Most of the model's class will be familiar to you, but there are two important layers we would like you to pay attention to:\n", + "\n", + "* Embedding Layer\n", + "> This layer is like a linear layer, but it makes it possible to use a sequence of indexes as inputs (instead of a sequence of one-hot-encoded vectors). During training, the Embedding layer learns a linear transformation from the space of words (a vector space of dimension `num_words_dict`) into a new, smaller, vector space of dimension `embedding_dim`. We suggest you read this [thread](https://discuss.pytorch.org/t/how-does-nn-embedding-work/88518/3) and the [pytorch documentation](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) if you want to learn more about this particular kind of layer.\n", + "\n", + "\n", + "* LSTM layer\n", + "> This is one of the most used classes of Recurrent Neural Networks. In Pytorch we can add several stacked layers in just one line of code. In our case, the number of layers added is set by the parameter `no_layers`. 
If you want to learn more about LSTMs we strongly recommend you this [Colahs thread](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) about them.\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "class SentimentRNN(nn.Module):\n", + " def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.1):\n", + " super(SentimentRNN,self).__init__()\n", + "\n", + " self.output_dim = output_dim\n", + " self.hidden_dim = hidden_dim\n", + " self.no_layers = no_layers\n", + " self.vocab_size = vocab_size\n", + " self.drop_prob = drop_prob\n", + "\n", + " # Embedding Layer\n", + " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n", + "\n", + " # LSTM Layers\n", + " self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,\n", + " num_layers=no_layers, batch_first=True,\n", + " dropout=self.drop_prob)\n", + "\n", + " # Dropout layer\n", + " self.dropout = nn.Dropout(drop_prob)\n", + "\n", + " # Linear and Sigmoid layer\n", + " self.fc = nn.Linear(self.hidden_dim, output_dim)\n", + " self.sig = nn.Sigmoid()\n", + "\n", + " def forward(self,x,hidden):\n", + " batch_size = x.size(0)\n", + "\n", + " # Embedding out\n", + " embeds = self.embedding(x)\n", + " #Shape: [batch_size x max_length x embedding_dim]\n", + "\n", + " # LSTM out\n", + " lstm_out, hidden = self.lstm(embeds, hidden)\n", + " # Shape: [batch_size x max_length x hidden_dim]\n", + "\n", + " # Select the activation of the last Hidden Layer\n", + " lstm_out = lstm_out[:,-1,:].contiguous()\n", + " # Shape: [batch_size x hidden_dim]\n", + "\n", + " ## You can instead average the activations across all the times\n", + " # lstm_out = torch.mean(lstm_out, 1).contiguous()\n", + "\n", + " # Dropout and Fully connected layer\n", + " out = self.dropout(lstm_out)\n", + " out = self.fc(out)\n", + "\n", + " # Sigmoid function\n", + " sig_out = 
self.sig(out)\n", + "\n", + " # return last sigmoid output and hidden state\n", + " return sig_out, hidden\n", + "\n", + " def init_hidden(self, batch_size):\n", + " ''' Initializes hidden state '''\n", + " # Create two new tensors with sizes n_layers x batch_size x hidden_dim,\n", + " # initialized to zero, for hidden state and cell state of LSTM\n", + " h0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)\n", + " c0 = torch.zeros((self.no_layers,batch_size,self.hidden_dim)).to(device)\n", + " hidden = (h0,c0)\n", + " return hidden" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "We choose the parameters of the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# Parameters of our network\n", + "\n", + "# Size of our vocabulary\n", + "vocab_size = num_words_dict\n", + "\n", + "# Embedding dimension\n", + "embedding_dim = 32\n", + "\n", + "# Number of stacked LSTM layers\n", + "no_layers = 2\n", + "\n", + "# Dimension of the hidden layer in LSTMs\n", + "hidden_dim = 64\n", + "\n", + "# Output dimension (a single sigmoid unit for binary sentiment)\n", + "output_dim = 1\n", + "\n", + "# Dropout parameter for regularization\n", + "drop_prob = 0.25" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SentimentRNN(\n", + " (embedding): Embedding(30000, 32)\n", + " (lstm): LSTM(32, 64, num_layers=2, batch_first=True, dropout=0.25)\n", + " (dropout): Dropout(p=0.25, inplace=False)\n", + " (fc): Linear(in_features=64, out_features=1, bias=True)\n", + " (sig): Sigmoid()\n", + ")\n" + ] + } + ], + "source": [ + "# Let's define our model\n", + "model = SentimentRNN(no_layers, vocab_size, hidden_dim,\n", + " embedding_dim, drop_prob=drop_prob)\n", + "# Moving to gpu\n", + 
"model.to(device)\n", + "print(model)" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total Number of parameters: 1018433\n" + ] } + ], + "source": [ + "# How many trainable parameters does our model have?\n", + "model_parameters = filter(lambda p: p.requires_grad, model.parameters())\n", + "params = sum([np.prod(p.size()) for p in model_parameters])\n", + "print('Total Number of parameters: ',params)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "We choose the loss and the optimizer for the training process." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [], + "source": [ + "# loss and optimization functions\n", + "lr = 0.001\n", + "\n", + "# Binary crossentropy is a good loss function for a binary classification problem\n", + "criterion = nn.BCELoss()\n", + "\n", + "# We choose an Adam optimizer\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", + "\n", + "# function to count the correct predictions in a batch\n", + "def acc(pred,label):\n", + " pred = torch.round(pred.squeeze())\n", + " return torch.sum(pred == label.squeeze()).item()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "We are ready to train our model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1\n", + "train_loss : 0.4367361353733577 val_loss : 0.39174133955966683\n", + "train_accuracy : 79.530625 val_accuracy : 82.3628125\n", + "Validation loss decreased (inf --> 0.391741). 
Saving model ...\n", + "==================================================\n", + "Epoch 2\n", + "train_loss : 0.3765802335098851 val_loss : 0.3724124691961333\n", + "train_accuracy : 83.19140625 val_accuracy : 83.42031250000001\n", + "Validation loss decreased (0.391741 --> 0.372412). Saving model ...\n", + "==================================================\n", + "Epoch 3\n", + "train_loss : 0.35746844720793886 val_loss : 0.365050206175074\n", + "train_accuracy : 84.16882812499999 val_accuracy : 83.7440625\n", + "Validation loss decreased (0.372412 --> 0.365050). Saving model ...\n", + "==================================================\n", + "Epoch 4\n", + "train_loss : 0.34491546426317654 val_loss : 0.36467386982403693\n", + "train_accuracy : 84.879140625 val_accuracy : 83.77\n", + "Validation loss decreased (0.365050 --> 0.364674). Saving model ...\n", + "==================================================\n", + "Epoch 5\n", + "train_loss : 0.33429012800217606 val_loss : 0.36189084346871825\n", + "train_accuracy : 85.44296875 val_accuracy : 84.0221875\n", + "Validation loss decreased (0.364674 --> 0.361891). 
Saving model ...\n", + "==================================================\n" + ] + } + ], + "source": [ + "# Number of training Epochs\n", + "epochs = 5\n", + "\n", + "# Maximum absolute value accepted for the gradeint\n", + "clip = 5\n", + "\n", + "# Initial Loss value (assumed big)\n", + "valid_loss_min = np.Inf\n", + "\n", + "# Lists to follow the evolution of the loss and accuracy\n", + "epoch_tr_loss,epoch_vl_loss = [],[]\n", + "epoch_tr_acc,epoch_vl_acc = [],[]\n", + "\n", + "# Train for a number of Epochs\n", + "for epoch in range(epochs):\n", + " train_losses = []\n", + " train_acc = 0.0\n", + " model.train()\n", + "\n", + " for inputs, labels in train_loader:\n", + "\n", + " # Initialize hidden state\n", + " h = model.init_hidden(batch_size)\n", + " # Creating new variables for the hidden state\n", + " h = tuple([each.data.to(device) for each in h])\n", + "\n", + " # Move batch inputs and labels to gpu\n", + " inputs, labels = inputs.to(device), labels.to(device)\n", + "\n", + " # Set gradient to zero\n", + " model.zero_grad()\n", + "\n", + " # Compute model output\n", + " output,h = model(inputs,h)\n", + "\n", + " # Calculate the loss and perform backprop\n", + " loss = criterion(output.squeeze(), labels.float())\n", + " loss.backward()\n", + " train_losses.append(loss.item())\n", + "\n", + " # calculating accuracy\n", + " accuracy = acc(output,labels)\n", + " train_acc += accuracy\n", + "\n", + " #`clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.\n", + " nn.utils.clip_grad_norm_(model.parameters(), clip)\n", + " optimizer.step()\n", + "\n", + "\n", + " # Evaluate on the validation set for this epoch\n", + " val_losses = []\n", + " val_acc = 0.0\n", + " model.eval()\n", + " for inputs, labels in valid_loader:\n", + "\n", + " # Initialize hidden state\n", + " val_h = model.init_hidden(batch_size)\n", + " val_h = tuple([each.data.to(device) for each in val_h])\n", + "\n", + " # Move batch inputs and labels to gpu\n", + " 
inputs, labels = inputs.to(device), labels.to(device)\n", + "\n", + " # Compute model output\n", + " output, val_h = model(inputs, val_h)\n", + "\n", + " # Compute Loss\n", + " val_loss = criterion(output.squeeze(), labels.float())\n", + "\n", + " val_losses.append(val_loss.item())\n", + "\n", + " accuracy = acc(output,labels)\n", + " val_acc += accuracy\n", + "\n", + " epoch_train_loss = np.mean(train_losses)\n", + " epoch_val_loss = np.mean(val_losses)\n", + " epoch_train_acc = train_acc/len(train_loader.dataset)\n", + " epoch_val_acc = val_acc/len(valid_loader.dataset)\n", + " epoch_tr_loss.append(epoch_train_loss)\n", + " epoch_vl_loss.append(epoch_val_loss)\n", + " epoch_tr_acc.append(epoch_train_acc)\n", + " epoch_vl_acc.append(epoch_val_acc)\n", + " print(f'Epoch {epoch+1}')\n", + " print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')\n", + " print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')\n", + " if epoch_val_loss <= valid_loss_min:\n", + " print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min,epoch_val_loss))\n", + " # torch.save(model.state_dict(), '../working/state_dict.pt')\n", + " valid_loss_min = epoch_val_loss\n", + " print(25*'==')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": {} + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig = plt.figure(figsize = (20, 6))\n", + "plt.subplot(1, 2, 1)\n", + "plt.plot(epoch_tr_acc, label='Train Acc')\n", + "plt.plot(epoch_vl_acc, label='Validation Acc')\n", + "plt.title(\"Accuracy\")\n", + "plt.legend()\n", + "plt.grid()\n", + "\n", + "plt.subplot(1, 2, 2)\n", + "plt.plot(epoch_tr_loss, label='Train loss')\n", + "plt.plot(epoch_vl_loss, label='Validation loss')\n", + "plt.title(\"Loss\")\n", + "plt.legend()\n", + "plt.grid()\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "execution": {} + }, + "source": [ + "---\n", + "# What's Next?\n", + "\n", + "You can use this project template as a starting point to think about your own project. There are a lot of ways to continue; here we share with you some ideas you might find useful:\n", + "\n", + "* **Work on the Preprocessing.** We used a very rudimentary way to tokenize tweets. But there are better ways to preprocess the data. Can you think of a suitable way to preprocess the data for this particular task? How does the performance of the model change when the data is processed correctly?\n", + "* **Work on the Model.** The RNN model proposed in this notebook is not optimized at all. You can work on finding a better architecture or better hyperparameters. Maybe using bidirectional LSTMs or increasing the number of stacked layers can improve the performance; feel free to try different approaches.\n", + "* **Work on the Embedding.** Our model learnt an embedding during the training on this Twitter corpus for a particular task. You can explore the representation of different words in this learned embedding. Also, you can try using different word embeddings. You can train them on this corpus or you can use an embedding trained on another corpus of data. 
How does the change of the embedding affect the model performance?\n", + "* **Try sentiment analysis on another dataset.** There are lots of available datasets to work with; we can help you find one that is interesting to you. Do you believe that a sentiment analysis model trained on some corpus (Twitter dataset) will perform well on another type of data (for example, YouTube comments)?\n", + "\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "include_colab_link": true, + "name": "sentiment_analysis", + "provenance": [], + "toc_visible": true + }, + "kernel": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From d159bb5065981f035e2e3bf9444322d47fdcc16e Mon Sep 17 00:00:00 2001 From: Soan Kim <39689481+SoanKim@users.noreply.github.com> Date: Tue, 9 Jul 2024 01:40:18 +0900 Subject: [PATCH 08/12] Update lunar_lander.ipynb deleted duplicated minigrid installation --- projects/ReinforcementLearning/lunar_lander.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/projects/ReinforcementLearning/lunar_lander.ipynb b/projects/ReinforcementLearning/lunar_lander.ipynb index 24a8e0d37..b02af964d 100644 --- a/projects/ReinforcementLearning/lunar_lander.ipynb +++ b/projects/ReinforcementLearning/lunar_lander.ipynb @@ -92,7 +92,6 @@ "!pip install pyvirtualdisplay --quiet\n", "!pip install pyglet --quiet\n", "!pip install pygame --quiet\n", - "!pip install minigrid --quiet\n", "!pip install -q gymnasium[box2d] --quiet\n", "!pip install 'minigrid<=2.1.1' --quiet\n", "!pip3 install box2d-py --quiet" From 3edc9188881fd43a197c245582c442862888cd7f Mon Sep 17 00:00:00 2001 From: Zoltan Date: Mon, 8 Jul 2024 17:01:52 -0400 Subject: [PATCH 09/12] Update notebook-pr.yaml 
ci:execute --- .github/workflows/notebook-pr.yaml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/workflows/notebook-pr.yaml b/.github/workflows/notebook-pr.yaml index 5d3a3f64c..b14d492da 100644 --- a/.github/workflows/notebook-pr.yaml +++ b/.github/workflows/notebook-pr.yaml @@ -15,7 +15,22 @@ jobs: runs-on: ubuntu-latest steps: - + - name: Free up disk space + uses: jlumbroso/free-disk-space@main + with: + # this might remove tools that are actually needed, + # if set to "true" but frees about 6 GB + tool-cache: false + + # all of these default to true, but feel free to set to + # "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true + - name: Checkout uses: actions/checkout@v3 with: From 8fa1bd6412ef53d8e10a7a59720be896b8db73e8 Mon Sep 17 00:00:00 2001 From: Zoltan Date: Mon, 8 Jul 2024 17:16:32 -0400 Subject: [PATCH 10/12] Update requirements.txt ci:execute --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index ab36879c3..4b78c17c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ torchvision pathlib xkcd decorator==5.0.9 +pyvirtualdisplay From 93cccbfb8aa2cb6089d3821193341190a6c16310 Mon Sep 17 00:00:00 2001 From: Zoltan Date: Mon, 8 Jul 2024 18:36:26 -0400 Subject: [PATCH 11/12] Update notebook-pr.yaml ci:execute --- .github/workflows/notebook-pr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/notebook-pr.yaml b/.github/workflows/notebook-pr.yaml index b14d492da..5cbdfd03d 100644 --- a/.github/workflows/notebook-pr.yaml +++ b/.github/workflows/notebook-pr.yaml @@ -63,6 +63,7 @@ jobs: - name: Install dependencies if: "!contains(env.COMMIT_MESSAGE, 'skip ci') && contains(env.COMMIT_MESSAGE, 'ci:execute')" run: | + sudo apt-get update && sudo apt install xvfb -y python -m pip install --upgrade pip wheel pip install -r 
requirements.txt pip install jupyter_client==7.3.5 # downgrade jupyter-client to fix hangs From 942e35c5229f79ea2c6ae113731692eaff2b76b2 Mon Sep 17 00:00:00 2001 From: Zoltan Date: Mon, 8 Jul 2024 19:33:12 -0400 Subject: [PATCH 12/12] update reqs and use of np.inf ci:execute --- projects/NaturalLanguageProcessing/sentiment_analysis.ipynb | 2 +- requirements.txt | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb b/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb index 87c305bde..70b008cb9 100644 --- a/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb +++ b/projects/NaturalLanguageProcessing/sentiment_analysis.ipynb @@ -1504,7 +1504,7 @@ "clip = 5\n", "\n", "# Initial Loss value (assumed big)\n", - "valid_loss_min = np.Inf\n", + "valid_loss_min = np.inf\n", "\n", "# Lists to follow the evolution of the loss and accuracy\n", "epoch_tr_loss,epoch_vl_loss = [],[]\n", diff --git a/requirements.txt b/requirements.txt index 4b78c17c3..b8d8390a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,5 @@ pathlib xkcd decorator==5.0.9 pyvirtualdisplay +tensorboard +moviepy