Robinson #2 (Open): wants to merge 9 commits into master
Changes from all commits
23 changes: 23 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,23 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [

{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
},
{
"type": "pwa-chrome",
"request": "launch",
"name": "Launch Chrome against localhost",
"url": "http://localhost:8080",
"webRoot": "${workspaceFolder}"
}
]
}
4 changes: 4 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,4 @@
{
"jupyter.jupyterServerType": "local",
"python.pythonPath": "C:\\Users\\brobi\\miniconda3\\python.exe"
}
51 changes: 51 additions & 0 deletions CodeExamples/Comparing Words via word embeddings.ipynb
@@ -0,0 +1,51 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#This requires a list of words and a list of their corresponding vectors\n",
"\n",
"import spacy\n",
"from scipy.spatial.distance import cosine\n",
"from processing import most_common_words, vector_list\n",
"\n",
"# print word and vector representation at index 347\n",
"#print(most_common_words[347])\n",
"#print(vector_list[347])\n",
"\n",
"# define find_closest_words\n",
"def find_closest_words(word_list, vector_list, word_to_check):\n",
" return sorted(word_list,\n",
" key=lambda x: cosine(vector_list[word_list.index(word_to_check)], vector_list[word_list.index(x)]))[:10]\n",
"\n",
"# find closest words to food\n",
"close_to_food = find_closest_words(most_common_words, vector_list, 'food')\n",
"print(close_to_food)\n",
"\n",
"# find closest words to summer\n",
"close_to_summer = find_closest_words(most_common_words, vector_list, 'summer')\n",
"print(close_to_summer)\n"
]
}
]
}
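Note: the cell above relies on a local processing module that supplies most_common_words and vector_list, which is not part of this pull request. As a rough, self-contained sketch of the same nearest-word lookup, one could build both lists directly from a spaCy model with word vectors (the en_core_web_md model and the small example vocabulary below are assumptions for illustration, not the course's data):

# Hypothetical self-contained version of the lookup in the cell above.
# Assumes en_core_web_md is installed: python -m spacy download en_core_web_md
import spacy
from scipy.spatial.distance import cosine

nlp = spacy.load('en_core_web_md')

# Small example vocabulary standing in for most_common_words / vector_list from `processing`.
most_common_words = ['food', 'dinner', 'meal', 'summer', 'winter', 'holiday', 'car', 'music']
vector_list = [nlp.vocab[word].vector for word in most_common_words]

def find_closest_words(word_list, vectors, word_to_check):
    # Sort every word by cosine distance to the query word's vector; smaller distance = closer meaning.
    target = vectors[word_list.index(word_to_check)]
    return sorted(word_list, key=lambda w: cosine(target, vectors[word_list.index(w)]))[:10]

print(find_closest_words(most_common_words, vector_list, 'food'))
print(find_closest_words(most_common_words, vector_list, 'summer'))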
76 changes: 76 additions & 0 deletions CodeExamples/Features Vector.ipynb
@@ -0,0 +1,76 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import nltk, re\n",
"from nltk.corpus import wordnet\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.stem import WordNetLemmatizer\n",
"from collections import Counter\n",
"\n",
"stop_words = stopwords.words('english')\n",
"normalizer = WordNetLemmatizer()\n",
"\n",
"def get_part_of_speech(word):\n",
" probable_part_of_speech = wordnet.synsets(word)\n",
" pos_counts = Counter()\n",
" pos_counts[\"n\"] = len( [ item for item in probable_part_of_speech if item.pos()==\"n\"] )\n",
" pos_counts[\"v\"] = len( [ item for item in probable_part_of_speech if item.pos()==\"v\"] )\n",
" pos_counts[\"a\"] = len( [ item for item in probable_part_of_speech if item.pos()==\"a\"] )\n",
" pos_counts[\"r\"] = len( [ item for item in probable_part_of_speech if item.pos()==\"r\"] )\n",
" most_likely_part_of_speech = pos_counts.most_common(1)[0][0]\n",
" return most_likely_part_of_speech\n",
"\n",
"def preprocess_text(text):\n",
" cleaned = re.sub(r'\\W+', ' ', text).lower()\n",
" tokenized = word_tokenize(cleaned)\n",
" normalized = [normalizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized]\n",
" return normalized"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from preprocessing import preprocess_text\n",
"# Define text_to_bow_vector() below:\n",
"def text_to_bow_vector(some_text, features_dictionary):\n",
" bow_vector = [0] * ((len(features_dictionary)))\n",
" tokens = preprocess_text(some_text)\n",
" for token in tokens:\n",
" feature_index = features_dictionary[token]\n",
" bow_vector[feature_index] += 1\n",
" return bow_vector, tokens\n",
"\n",
"features_dictionary = {'function': 8, 'please': 14, 'find': 6, 'five': 0, 'with': 12, 'fantastic': 1, 'my': 11, 'another': 10, 'a': 13, 'maybe': 9, 'to': 5, 'off': 4, 'faraway': 7, 'fish': 2, 'fly': 3}\n",
"\n",
"text = \"Another five fish find another faraway fish.\"\n",
"print(text_to_bow_vector(text, features_dictionary)[0])"
]
}
]
}
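Note: the second cell imports preprocess_text from a preprocessing module that is not included in this pull request; the notebook's first cell defines an equivalent function. As a minimal sketch for checking the bag-of-words counting in isolation, one could swap in a naive whitespace tokenizer (the tokenizer and the trimmed sample dictionary below are illustrative assumptions, not the course's files):

# Minimal sketch: same bag-of-words counting, with a naive tokenizer in place of preprocess_text.
import re

def simple_preprocess(text):
    # Lowercase and split on non-word characters (no lemmatization, unlike preprocess_text above).
    return re.sub(r'\W+', ' ', text).lower().split()

def text_to_bow_vector(some_text, features_dictionary, tokenize=simple_preprocess):
    # One slot per feature; count how often each known token appears in the text.
    bow_vector = [0] * len(features_dictionary)
    for token in tokenize(some_text):
        if token in features_dictionary:  # ignore out-of-vocabulary tokens
            bow_vector[features_dictionary[token]] += 1
    return bow_vector

features_dictionary = {'another': 0, 'five': 1, 'fish': 2, 'find': 3, 'faraway': 4}
print(text_to_bow_vector("Another five fish find another faraway fish.", features_dictionary))
# expected output: [2, 1, 2, 1, 1]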