Fix cache
Olivie Franklova (CZ) authored and committed on May 16, 2024
1 parent c9f5b17 commit c96e609
Showing 4 changed files with 91 additions and 115 deletions.
25 changes: 20 additions & 5 deletions column2Vec/Column2Vec.py
@@ -1,6 +1,7 @@
"""
This file contains column2Vec implementations.
"""
import json
import re

import numpy as np
@@ -14,10 +15,11 @@ class Cache:
"""
__cache = pd.DataFrame()
__read_from_file = False
__on = True

def __read(self):
try:
self.__cache = pd.read_csv("generated/cache.txt")
self.__cache = pd.io.parsers.read_csv("generated/cache.txt", index_col=0)
        except Exception:
            pass  # the cache file may not exist yet; start with an empty cache

@@ -29,11 +31,13 @@ def get_cache(self, key: str, function: str) -> list | None:
:param function: Name of function
:return: Cache for a specific key
"""
if not self.__on:
return None
if not self.__read_from_file:
self.__read()
self.__read_from_file = True
if function in self.__cache.index and key in self.__cache.columns:
return self.__cache.loc[function, key]
return json.loads(self.__cache.loc[function, key]) # json is faster than ast
return None

def save(self, key: str, function: str, embedding: list):
@@ -43,15 +47,26 @@ def save(self, key: str, function: str, embedding: list):
:param function: Function name
:param embedding: to save
"""
print(f"|{int(function)}| : |{int(key)}|") # todo solve this
self.__cache.at[function, key] = embedding
if not self.__on:
return
self.__cache.loc[function, key] = str(list(embedding))
print(f"{self.__cache.loc[function, key]}")
# self.__cache.loc[function, key] = embedding

def save_persistently(self):
"""
Write cache to csv file
"""
self.__cache.to_csv("generated/cache.txt")
if not self.__on:
return
print(self.__cache.index)
print(self.__cache.columns)
self.__cache.to_csv("generated/cache.txt", index=True)

    def off(self):
        """Turn caching off."""
        self.__on = False

    def on(self):
        """Turn caching on."""
        self.__on = True


cache = Cache()
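The essence of the fix: embeddings are stored in the cache DataFrame as stringified lists and parsed back with `json.loads` when read, so they survive the CSV round trip. A standalone sketch of that round trip (illustrative only; the store and labels below are hypothetical):

```python
import json

import pandas as pd

store = pd.DataFrame()  # stands in for the Cache's private DataFrame
embedding = [0.1, 0.2, 0.3]

# Save: stringify the embedding so it survives the CSV round trip.
store.loc["column2vec_avg", "reg_city"] = str(list(embedding))

# Load: parse the string back into a list (json is faster than ast).
restored = json.loads(store.loc["column2vec_avg", "reg_city"])
assert restored == embedding
```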
38 changes: 36 additions & 2 deletions column2Vec/README.md
@@ -8,9 +8,20 @@ We have implemented seven different approaches.
Folder [**generated**](generated) contains all generated files,
mostly html files representing 2D clusters created by clustering vectors.
It also contains a cache file where created embeddings can be stored.
Caching can be switched on or off.

File [**Column2Vec.py**](Column2Vec.py) contains 7 different implementations of column2Vec.
All implementations can use the cache.
Both a runtime cache and a persistent cache (stored in cache.txt in the generated folder) are implemented.
To store the cache persistently, call:
```python
cache.save_persistently()
```
The runtime cache is filled automatically. To switch caching off, call:
```python
cache.off()
```
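Caching can be re-enabled with `cache.on()` (it is on by default). Putting it together, a minimal sketch of a cached run (assuming `get_vectors` from [**functions.py**](functions.py) and the `data` dictionary described below):

```python
from column2Vec.Column2Vec import cache, column2vec_avg
from column2Vec.functions import get_vectors

cache.on()                                   # caching is on by default; shown for clarity
vectors = get_vectors(column2vec_avg, data)  # embeddings are cached at runtime
cache.save_persistently()                    # write the cache to generated/cache.txt
```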
## Implementation description
- **column2vec_as_sentence** creates one string from the column and then transforms it into a vector
- **column2vec_as_sentence_clean** creates one string from the column that contains only numbers and a-z, then transforms the cleaned string into a vector.
@@ -21,6 +32,29 @@ file [**Column2Vec.py**](Column2Vec.py) contains 7 different implementations of
- **column2vec_weighted_sum** transforms every element in the column into a vector and then computes their weighted sum.

> Inspired by [Michael J. Mior, Alexander G. Ororbia](https://arxiv.org/pdf/1903.08621)
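
For intuition, the averaging variant works roughly as follows (a simplified sketch of the implementation, assuming a sentence-transformers model; the model name is only an example, and the real function also consults the cache):

```python
import numpy as np
from sentence_transformers import SentenceTransformer

def column2vec_avg_sketch(column, model):
    # Encode every distinct value in the column, then average the vectors.
    values = [str(value) for value in set(column)]
    encoded = model.encode(values)   # one embedding per distinct value
    return np.mean(encoded, axis=0)  # arithmetic mean of the embeddings

model = SentenceTransformer("all-MiniLM-L6-v2")  # example model
vector = column2vec_avg_sketch(["red", "green", "blue"], model)
```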
File [**functions.py**](functions.py) contains helper functions for using column2Vec:
- get_nonnumerical_data (returns string columns)
- get_vectors (creates embeddings)
- get_clusters (creates clusters and returns a list of them)
- plot_clusters (creates and plots clusters)
- compute_distances (computes distances between vectors)

## How to use
You can create vectors (embeddings) by using one of the seven implementations.
Call the `get_vectors` function with two parameters:
the first is a column2Vec implementation,
the second is the data, a dictionary mapping string to column.
You should use only nonnumerical data, which you can get from a table by running `get_nonnumerical_data`.
```python
vectors_avg = get_vectors(column2vec_avg, data)
```
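
Here `data` is the dictionary of string columns. A sketch of preparing it end to end (assuming `get_nonnumerical_data` accepts a list of CSV paths, as in the playground notebook; the file path is only an example):

```python
from column2Vec.Column2Vec import column2vec_avg
from column2Vec.functions import get_nonnumerical_data, get_vectors

files = ["../data/aircraft-data_nov_dec.csv"]    # example input table
data = get_nonnumerical_data(files)              # {column_name: column} of string columns
vectors_avg = get_vectors(column2vec_avg, data)  # {column_name: embedding}
```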

If you want to create clusters or plot them,
call `get_clusters` or `plot_clusters`.
To get distances between vectors, run `compute_distances` (see the sketch below).
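
A sketch of those downstream steps (the signatures are assumptions; check [**functions.py**](functions.py) for the exact parameters):

```python
from column2Vec.functions import get_clusters, plot_clusters, compute_distances

clusters = get_clusters(vectors_avg)        # assumed: clusters over the embeddings
plot_clusters(vectors_avg)                  # assumed: renders a 2D cluster plot (HTML in generated/)
distances = compute_distances(vectors_avg)  # assumed: pairwise distances between vectors
```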

---
# Data and cluster description
#### Used tables
2 changes: 2 additions & 0 deletions column2Vec/generated/cache.txt

Large diffs are not rendered by default.

141 changes: 33 additions & 108 deletions column2Vec/playground.ipynb
@@ -20,8 +20,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-05-14T14:15:40.050299Z",
"start_time": "2024-05-14T14:15:33.880395Z"
"end_time": "2024-05-16T09:45:32.005491Z",
"start_time": "2024-05-16T09:45:26.322440Z"
}
},
"id": "d2f663cd8db4d03b",
@@ -34,8 +34,8 @@
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-05-14T14:15:40.057810Z",
"start_time": "2024-05-14T14:15:40.052810Z"
"end_time": "2024-05-16T09:45:32.012744Z",
"start_time": "2024-05-16T09:45:32.006527Z"
}
},
"outputs": [],
@@ -49,7 +49,8 @@
"fileM1 = \"../data/imdb_top_1000.csv\"\n",
"fileM2 = \"../data/netflix_titles.csv\"\n",
"#make an array of all the files\n",
"files = [fileA1, fileA2, fileC1, fileC2, fileC3, fileM1, fileM2]\n",
"# files = [fileA1, fileA2, fileC1, fileC2, fileC3, fileM1, fileM2]\n",
"files = [fileA1]\n",
"\n",
"# dataA1 = pd.read_csv(fileA1)\n",
"# dataA2 = pd.read_csv(fileA2)\n",
@@ -91,8 +92,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-05-14T14:15:40.066746Z",
"start_time": "2024-05-14T14:15:40.059808Z"
"end_time": "2024-05-16T09:45:32.025242Z",
"start_time": "2024-05-16T09:45:32.014963Z"
}
},
"id": "19c03920fae6aab8",
@@ -105,87 +106,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
" ../data/aircraft-data_nov_dec.csv : reg_city\n",
" ../data/aircraft-data_nov_dec.csv : reg_state\n",
" ../data/aircraft-data_nov_dec.csv : tail_number\n",
" ../data/aircraft-data_nov_dec.csv : reg_city\n",
" ../data/aircraft-data_nov_dec.csv : flight\n",
" ../data/aircraft-data_nov_dec.csv : tail_number\n",
" ../data/aircraft-data_nov_dec.csv : reg_expiration\n",
" ../data/aircraft-data_nov_dec.csv : reg_owner\n",
" ../data/aircraft-data_nov_dec.csv : manufacturer\n",
" ../data/aircraft-data_nov_dec.csv : reg_owner\n",
" ../data/aircraft-data_nov_dec.csv : model\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning: tzname AG identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n",
" warnings.warn(\"tzname {tzname} identified but not understood. \"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" ../data/Airplane_Cleaned.csv : TP mods\n",
" ../data/Airplane_Cleaned.csv : Multi Engine\n",
" ../data/Airplane_Cleaned.csv : Engine Type\n",
" ../data/Airplane_Cleaned.csv : Company\n",
" ../data/Airplane_Cleaned.csv : Model\n",
" ../data/autoscout24-germany-dataset.csv : make\n",
" ../data/autoscout24-germany-dataset.csv : gear\n",
" ../data/autoscout24-germany-dataset.csv : model\n",
" ../data/autoscout24-germany-dataset.csv : offerType\n",
" ../data/autoscout24-germany-dataset.csv : fuel\n",
" ../data/CARS_1.csv : fuel_type\n",
" ../data/CARS_1.csv : transmission_type\n",
" ../data/CARS_1.csv : car_name\n",
" ../data/CARS_1.csv : body_type\n",
" ../data/USA_cars_datasets.csv : model\n",
" ../data/USA_cars_datasets.csv : brand\n",
" ../data/USA_cars_datasets.csv : country\n",
" ../data/USA_cars_datasets.csv : vin\n",
" ../data/USA_cars_datasets.csv : title_status\n",
" ../data/USA_cars_datasets.csv : condition\n",
" ../data/USA_cars_datasets.csv : state\n",
" ../data/USA_cars_datasets.csv : color\n",
" ../data/imdb_top_1000.csv : Poster_Link\n",
" ../data/imdb_top_1000.csv : Gross\n",
" ../data/imdb_top_1000.csv : Certificate\n",
" ../data/imdb_top_1000.csv : Series_Title\n",
" ../data/imdb_top_1000.csv : Star3\n",
" ../data/imdb_top_1000.csv : Director\n",
" ../data/imdb_top_1000.csv : Star2\n",
" ../data/imdb_top_1000.csv : Star1\n",
" ../data/imdb_top_1000.csv : Star4\n",
" ../data/imdb_top_1000.csv : Overview\n",
" ../data/imdb_top_1000.csv : Genre\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning: tzname ELSIE identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n",
" warnings.warn(\"tzname {tzname} identified but not understood. \"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" ../data/netflix_titles.csv : show_id\n",
" ../data/netflix_titles.csv : title\n",
" ../data/netflix_titles.csv : description\n",
" ../data/netflix_titles.csv : director\n",
" ../data/netflix_titles.csv : cast\n",
" ../data/netflix_titles.csv : country\n",
" ../data/netflix_titles.csv : rating\n",
" ../data/netflix_titles.csv : type\n",
" ../data/netflix_titles.csv : duration\n",
" ../data/netflix_titles.csv : listed_in\n",
" ../data/netflix_titles.csv : date_added\n"
]
}
],
"source": [
@@ -196,8 +125,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-05-14T14:16:20.221739Z",
"start_time": "2024-05-14T14:15:40.068761Z"
"end_time": "2024-05-16T09:45:42.139139Z",
"start_time": "2024-05-16T09:45:32.026381Z"
}
},
"id": "cfe57003e670ba15",
@@ -232,12 +161,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-26T09:58:32.218154Z",
"start_time": "2024-03-26T09:58:32.213907Z"
"end_time": "2024-05-16T09:45:42.145314Z",
"start_time": "2024-05-16T09:45:42.140194Z"
}
},
"id": "19c881d9f450b556",
"execution_count": 16
"execution_count": 5
},
{
"cell_type": "code",
@@ -246,38 +175,34 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Processing column: reg_city1 1.92%\n"
]
},
{
"ename": "ValueError",
"evalue": "invalid literal for int() with base 10: 'column2vec_avg'",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mValueError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[5], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m vectors_avg \u001B[38;5;241m=\u001B[39m \u001B[43mget_vectors\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcolumn2vec_avg\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdata\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\Desktop\\thesis\\simillarity\\column2Vec\\functions.py:61\u001B[0m, in \u001B[0;36mget_vectors\u001B[1;34m(function, data)\u001B[0m\n\u001B[0;32m 59\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m key \u001B[38;5;129;01min\u001B[39;00m data:\n\u001B[0;32m 60\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mProcessing column: \u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;241m+\u001B[39m key \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m \u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;241m+\u001B[39m \u001B[38;5;28mstr\u001B[39m(\u001B[38;5;28mround\u001B[39m((count \u001B[38;5;241m/\u001B[39m \u001B[38;5;28mlen\u001B[39m(data)) \u001B[38;5;241m*\u001B[39m \u001B[38;5;241m100\u001B[39m, \u001B[38;5;241m2\u001B[39m)) \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m%\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m---> 61\u001B[0m result[key] \u001B[38;5;241m=\u001B[39m \u001B[43mfunction\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdata\u001B[49m\u001B[43m[\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtrained_model\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_module\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 62\u001B[0m count \u001B[38;5;241m+\u001B[39m\u001B[38;5;241m=\u001B[39m \u001B[38;5;241m1\u001B[39m\n\u001B[0;32m 63\u001B[0m end \u001B[38;5;241m=\u001B[39m time\u001B[38;5;241m.\u001B[39mtime()\n",
"File \u001B[1;32m~\\Desktop\\thesis\\simillarity\\column2Vec\\Column2Vec.py:159\u001B[0m, in \u001B[0;36mcolumn2vec_avg\u001B[1;34m(column, model, key)\u001B[0m\n\u001B[0;32m 157\u001B[0m encoded_columns \u001B[38;5;241m=\u001B[39m model\u001B[38;5;241m.\u001B[39mencode(column_clean)\n\u001B[0;32m 158\u001B[0m to_ret \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39mmean(encoded_columns, axis\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m0\u001B[39m) \u001B[38;5;66;03m# counts arithmetic mean (average)\u001B[39;00m\n\u001B[1;32m--> 159\u001B[0m \u001B[43mcache\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msave\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mfunction_string\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mto_ret\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 160\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m to_ret\n",
"File \u001B[1;32m~\\Desktop\\thesis\\simillarity\\column2Vec\\Column2Vec.py:46\u001B[0m, in \u001B[0;36mCache.save\u001B[1;34m(self, key, function, embedding)\u001B[0m\n\u001B[0;32m 39\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21msave\u001B[39m(\u001B[38;5;28mself\u001B[39m, key: \u001B[38;5;28mstr\u001B[39m, function: \u001B[38;5;28mstr\u001B[39m, embedding: \u001B[38;5;28mlist\u001B[39m):\n\u001B[0;32m 40\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[0;32m 41\u001B[0m \u001B[38;5;124;03m Saves cache\u001B[39;00m\n\u001B[0;32m 42\u001B[0m \u001B[38;5;124;03m :param key: Column name\u001B[39;00m\n\u001B[0;32m 43\u001B[0m \u001B[38;5;124;03m :param function: Function name\u001B[39;00m\n\u001B[0;32m 44\u001B[0m \u001B[38;5;124;03m :param embedding: to save\u001B[39;00m\n\u001B[0;32m 45\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m---> 46\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m|\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28;43mint\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mfunction\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m| : |\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mint\u001B[39m(key)\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m|\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m 47\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m__cache\u001B[38;5;241m.\u001B[39mat[function, key] \u001B[38;5;241m=\u001B[39m embedding\n",
"\u001B[1;31mValueError\u001B[0m: invalid literal for int() with base 10: 'column2vec_avg'"
"Processing column: reg_state1 12.5%\n",
"Processing column: reg_city1 25.0%\n",
"Processing column: flight1 37.5%\n",
"Processing column: tail_number1 50.0%\n",
"Processing column: reg_expiration1 62.5%\n",
"Processing column: manufacturer1 75.0%\n",
"Processing column: reg_owner1 87.5%\n",
"Processing column: model1 100.0%\n",
"ELAPSED TIME :0.005300045013427734\n"
]
}
],
"source": [
"\n",
"\n",
"vectors_avg = get_vectors(column2vec_avg, data)"
"from column2Vec.Column2Vec import cache\n",
"cache.off()\n",
"vectors_avg = get_vectors(column2vec_avg, data)\n",
"# vectors_avg2 = get_vectors(column2vec_avg, data)\n",
"# cache.save_persistently()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-05-14T14:16:23.159447Z",
"start_time": "2024-05-14T14:16:20.222742Z"
"end_time": "2024-05-16T09:45:42.163420Z",
"start_time": "2024-05-16T09:45:42.145921Z"
}
},
"id": "d18443a1c921f509",
"execution_count": 5
"execution_count": 6
},
{
"cell_type": "markdown",
