Robinson #2 (Open): wants to merge 9 commits into master
Changes from all commits
23 changes: 23 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,23 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [

{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
},
{
"type": "pwa-chrome",
"request": "launch",
"name": "Launch Chrome against localhost",
"url": "http://localhost:8080",
"webRoot": "${workspaceFolder}"
}
]
}
4 changes: 4 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,4 @@
{
"jupyter.jupyterServerType": "local",
"python.pythonPath": "C:\\Users\\brobi\\miniconda3\\python.exe"
}
51 changes: 51 additions & 0 deletions CodeExamples/Comparing Words via word embeddings.ipynb
@@ -0,0 +1,51 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#This requires a list of words and a list of their corresponding vectors\n",
"\n",
"import spacy\n",
"from scipy.spatial.distance import cosine\n",
"from processing import most_common_words, vector_list\n",
"\n",
"# print word and vector representation at index 347\n",
"#print(most_common_words[347])\n",
"#print(vector_list[347])\n",
"\n",
"# define find_closest_words\n",
"def find_closest_words(word_list, vector_list, word_to_check):\n",
" return sorted(word_list,\n",
" key=lambda x: cosine(vector_list[word_list.index(word_to_check)], vector_list[word_list.index(x)]))[:10]\n",
"\n",
"# find closest words to food\n",
"close_to_food = find_closest_words(most_common_words, vector_list, 'food')\n",
"print(close_to_food)\n",
"\n",
"# find closest words to summer\n",
"close_to_summer = find_closest_words(most_common_words, vector_list, 'summer')\n",
"print(close_to_summer)\n"
]
}
]
}
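Note: the cell above relies on a local processing module that supplies most_common_words and vector_list, which is not part of this pull request. As a rough, self-contained sketch of the same nearest-word lookup, one could build both lists directly from a spaCy model with word vectors (the en_core_web_md model and the small example vocabulary below are assumptions for illustration, not the course's data):

# Hypothetical self-contained version of the lookup in the cell above.
# Assumes en_core_web_md is installed: python -m spacy download en_core_web_md
import spacy
from scipy.spatial.distance import cosine

nlp = spacy.load('en_core_web_md')

# Small example vocabulary standing in for most_common_words / vector_list from `processing`.
most_common_words = ['food', 'dinner', 'meal', 'summer', 'winter', 'holiday', 'car', 'music']
vector_list = [nlp.vocab[word].vector for word in most_common_words]

def find_closest_words(word_list, vectors, word_to_check):
    # Sort every word by cosine distance to the query word's vector; smaller distance = closer meaning.
    target = vectors[word_list.index(word_to_check)]
    return sorted(word_list, key=lambda w: cosine(target, vectors[word_list.index(w)]))[:10]

print(find_closest_words(most_common_words, vector_list, 'food'))
print(find_closest_words(most_common_words, vector_list, 'summer'))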
76 changes: 76 additions & 0 deletions CodeExamples/Features Vector.ipynb
@@ -0,0 +1,76 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import nltk, re\n",
"from nltk.corpus import wordnet\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.stem import WordNetLemmatizer\n",
"from collections import Counter\n",
"\n",
"stop_words = stopwords.words('english')\n",
"normalizer = WordNetLemmatizer()\n",
"\n",
"def get_part_of_speech(word):\n",
" probable_part_of_speech = wordnet.synsets(word)\n",
" pos_counts = Counter()\n",
" pos_counts[\"n\"] = len( [ item for item in probable_part_of_speech if item.pos()==\"n\"] )\n",
" pos_counts[\"v\"] = len( [ item for item in probable_part_of_speech if item.pos()==\"v\"] )\n",
" pos_counts[\"a\"] = len( [ item for item in probable_part_of_speech if item.pos()==\"a\"] )\n",
" pos_counts[\"r\"] = len( [ item for item in probable_part_of_speech if item.pos()==\"r\"] )\n",
" most_likely_part_of_speech = pos_counts.most_common(1)[0][0]\n",
" return most_likely_part_of_speech\n",
"\n",
"def preprocess_text(text):\n",
" cleaned = re.sub(r'\\W+', ' ', text).lower()\n",
" tokenized = word_tokenize(cleaned)\n",
" normalized = [normalizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized]\n",
" return normalized"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from preprocessing import preprocess_text\n",
"# Define text_to_bow_vector() below:\n",
"def text_to_bow_vector(some_text, features_dictionary):\n",
" bow_vector = [0] * ((len(features_dictionary)))\n",
" tokens = preprocess_text(some_text)\n",
" for token in tokens:\n",
" feature_index = features_dictionary[token]\n",
" bow_vector[feature_index] += 1\n",
" return bow_vector, tokens\n",
"\n",
"features_dictionary = {'function': 8, 'please': 14, 'find': 6, 'five': 0, 'with': 12, 'fantastic': 1, 'my': 11, 'another': 10, 'a': 13, 'maybe': 9, 'to': 5, 'off': 4, 'faraway': 7, 'fish': 2, 'fly': 3}\n",
"\n",
"text = \"Another five fish find another faraway fish.\"\n",
"print(text_to_bow_vector(text, features_dictionary)[0])"
]
}
]
}
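Note: the second cell imports preprocess_text from a preprocessing module that is not included in this pull request; the notebook's first cell defines an equivalent function. As a minimal sketch for checking the bag-of-words counting in isolation, one could swap in a naive whitespace tokenizer (the tokenizer and the trimmed sample dictionary below are illustrative assumptions, not the course's files):

# Minimal sketch: same bag-of-words counting, with a naive tokenizer in place of preprocess_text.
import re

def simple_preprocess(text):
    # Lowercase and split on non-word characters (no lemmatization, unlike preprocess_text above).
    return re.sub(r'\W+', ' ', text).lower().split()

def text_to_bow_vector(some_text, features_dictionary, tokenize=simple_preprocess):
    # One slot per feature; count how often each known token appears in the text.
    bow_vector = [0] * len(features_dictionary)
    for token in tokenize(some_text):
        if token in features_dictionary:  # ignore out-of-vocabulary tokens
            bow_vector[features_dictionary[token]] += 1
    return bow_vector

features_dictionary = {'another': 0, 'five': 1, 'fish': 2, 'find': 3, 'faraway': 4}
print(text_to_bow_vector("Another five fish find another faraway fish.", features_dictionary))
# expected output: [2, 1, 2, 1, 1]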