Fix cache
Olivie Franklova (CZ) authored and committed on May 16, 2024
1 parent c9f5b17 commit c96e609
Showing 4 changed files with 91 additions and 115 deletions.
25 changes: 20 additions & 5 deletions column2Vec/Column2Vec.py
@@ -1,6 +1,7 @@
"""
This file contains column2Vec implementations.
"""
import json
import re

import numpy as np
@@ -14,10 +15,11 @@ class Cache:
"""
__cache = pd.DataFrame()
__read_from_file = False
__on = True

def __read(self):
try:
self.__cache = pd.read_csv("generated/cache.txt")
self.__cache = pd.io.parsers.read_csv("generated/cache.txt", index_col=0)
        except Exception:
            pass  # the cache file may not exist yet; start with an empty cache

@@ -29,11 +31,13 @@ def get_cache(self, key: str, function: str) -> list | None:
:param function: Name of function
:return: Cache for a specific key
"""
if not self.__on:
return None
if not self.__read_from_file:
self.__read()
self.__read_from_file = True
if function in self.__cache.index and key in self.__cache.columns:
return self.__cache.loc[function, key]
return json.loads(self.__cache.loc[function, key]) # json is faster than ast
return None

def save(self, key: str, function: str, embedding: list):
@@ -43,15 +47,26 @@ def save(self, key: str, function: str, embedding: list):
:param function: Function name
:param embedding: to save
"""
print(f"|{int(function)}| : |{int(key)}|") # todo solve this
self.__cache.at[function, key] = embedding
if not self.__on:
return
self.__cache.loc[function, key] = str(list(embedding))
print(f"{self.__cache.loc[function, key]}")
# self.__cache.loc[function, key] = embedding

def save_persistently(self):
"""
Write cache to csv file
"""
self.__cache.to_csv("generated/cache.txt")
if not self.__on:
return
print(self.__cache.index)
print(self.__cache.columns)
self.__cache.to_csv("generated/cache.txt", index=True)

    def off(self):
        """Turn caching off."""
        self.__on = False

    def on(self):
        """Turn caching on."""
        self.__on = True


cache = Cache()
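The essence of the fix: embeddings are stored in the cache DataFrame as stringified lists and parsed back with `json.loads` when read, so they survive the CSV round trip. A standalone sketch of that round trip (illustrative only; the store and labels below are hypothetical):

```python
import json

import pandas as pd

store = pd.DataFrame()  # stands in for the Cache's private DataFrame
embedding = [0.1, 0.2, 0.3]

# Save: stringify the embedding so it survives the CSV round trip.
store.loc["column2vec_avg", "reg_city"] = str(list(embedding))

# Load: parse the string back into a list (json is faster than ast).
restored = json.loads(store.loc["column2vec_avg", "reg_city"])
assert restored == embedding
```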
38 changes: 36 additions & 2 deletions column2Vec/README.md
@@ -8,9 +8,20 @@ We have implemented seven different approaches.
Folder [**generated**](generated) contains all generated files,
mostly html files representing 2D clusters created by clustering vectors.
It also contains a cache file where created embeddings can be stored.
Caching can be switched on or off.

File [**Column2Vec.py**](Column2Vec.py) contains 7 different implementations of column2Vec.
All implementations can use the cache.
Both a runtime cache and a persistent cache (stored in cache.txt in the generated folder) are implemented.
To store the cache persistently, call:
```python
cache.save_persistently()
```
The runtime cache is filled automatically. To switch caching off, call:
```python
cache.off()
```
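Caching can be re-enabled with `cache.on()` (it is on by default). Putting it together, a minimal sketch of a cached run (assuming `get_vectors` from [**functions.py**](functions.py) and the `data` dictionary described below):

```python
from column2Vec.Column2Vec import cache, column2vec_avg
from column2Vec.functions import get_vectors

cache.on()                                   # caching is on by default; shown for clarity
vectors = get_vectors(column2vec_avg, data)  # embeddings are cached at runtime
cache.save_persistently()                    # write the cache to generated/cache.txt
```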
## Implementation description
- **column2vec_as_sentence** creates one string from the column and then transforms it into a vector
- **column2vec_as_sentence_clean** creates one string from the column that contains only numbers and a-z, then transforms the cleaned string into a vector.
@@ -21,6 +32,29 @@ file [**Column2Vec.py**](Column2Vec.py) contains 7 different implementations of
- **column2vec_weighted_sum** transforms every element in the column into a vector and then computes their weighted sum.

> Inspired by [Michael J. Mior, Alexander G. Ororbia](https://arxiv.org/pdf/1903.08621)
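
For intuition, the averaging variant works roughly as follows (a simplified sketch of the implementation, assuming a sentence-transformers model; the model name is only an example, and the real function also consults the cache):

```python
import numpy as np
from sentence_transformers import SentenceTransformer

def column2vec_avg_sketch(column, model):
    # Encode every distinct value in the column, then average the vectors.
    values = [str(value) for value in set(column)]
    encoded = model.encode(values)   # one embedding per distinct value
    return np.mean(encoded, axis=0)  # arithmetic mean of the embeddings

model = SentenceTransformer("all-MiniLM-L6-v2")  # example model
vector = column2vec_avg_sketch(["red", "green", "blue"], model)
```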
File [**functions.py**](functions.py) contains helper functions for using column2Vec:
- get_nonnumerical_data (returns string columns)
- get_vectors (creates embeddings)
- get_clusters (creates clusters and returns a list of them)
- plot_clusters (creates and plots clusters)
- compute_distances (computes distances between vectors)

## How to use
You can create vectors (embeddings) by using one of the seven implementations.
Call the `get_vectors` function with two parameters:
the first is a column2Vec implementation,
the second is the data, a dictionary mapping string to column.
You should use only nonnumerical data, which you can get from a table by running `get_nonnumerical_data`.
```python
vectors_avg = get_vectors(column2vec_avg, data)
```
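
Here `data` is the dictionary of string columns. A sketch of preparing it end to end (assuming `get_nonnumerical_data` accepts a list of CSV paths, as in the playground notebook; the file path is only an example):

```python
from column2Vec.Column2Vec import column2vec_avg
from column2Vec.functions import get_nonnumerical_data, get_vectors

files = ["../data/aircraft-data_nov_dec.csv"]    # example input table
data = get_nonnumerical_data(files)              # {column_name: column} of string columns
vectors_avg = get_vectors(column2vec_avg, data)  # {column_name: embedding}
```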

If you want to create clusters or plot them,
call `get_clusters` or `plot_clusters`.
To get distances between vectors, run `compute_distances` (see the sketch below).
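
A sketch of those downstream steps (the signatures are assumptions; check [**functions.py**](functions.py) for the exact parameters):

```python
from column2Vec.functions import get_clusters, plot_clusters, compute_distances

clusters = get_clusters(vectors_avg)        # assumed: clusters over the embeddings
plot_clusters(vectors_avg)                  # assumed: renders a 2D cluster plot (HTML in generated/)
distances = compute_distances(vectors_avg)  # assumed: pairwise distances between vectors
```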

---
# Data and cluster description
#### Used tables
2 changes: 2 additions & 0 deletions column2Vec/generated/cache.txt

Large diffs are not rendered by default.

141 changes: 33 additions & 108 deletions column2Vec/playground.ipynb
@@ -20,8 +20,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-05-14T14:15:40.050299Z",
"start_time": "2024-05-14T14:15:33.880395Z"
"end_time": "2024-05-16T09:45:32.005491Z",
"start_time": "2024-05-16T09:45:26.322440Z"
}
},
"id": "d2f663cd8db4d03b",
@@ -34,8 +34,8 @@
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-05-14T14:15:40.057810Z",
"start_time": "2024-05-14T14:15:40.052810Z"
"end_time": "2024-05-16T09:45:32.012744Z",
"start_time": "2024-05-16T09:45:32.006527Z"
}
},
"outputs": [],
@@ -49,7 +49,8 @@
"fileM1 = \"../data/imdb_top_1000.csv\"\n",
"fileM2 = \"../data/netflix_titles.csv\"\n",
"#make an array of all the files\n",
"files = [fileA1, fileA2, fileC1, fileC2, fileC3, fileM1, fileM2]\n",
"# files = [fileA1, fileA2, fileC1, fileC2, fileC3, fileM1, fileM2]\n",
"files = [fileA1]\n",
"\n",
"# dataA1 = pd.read_csv(fileA1)\n",
"# dataA2 = pd.read_csv(fileA2)\n",
@@ -91,8 +92,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-05-14T14:15:40.066746Z",
"start_time": "2024-05-14T14:15:40.059808Z"
"end_time": "2024-05-16T09:45:32.025242Z",
"start_time": "2024-05-16T09:45:32.014963Z"
}
},
"id": "19c03920fae6aab8",
@@ -105,87 +106,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
" ../data/aircraft-data_nov_dec.csv : reg_city\n",
" ../data/aircraft-data_nov_dec.csv : reg_state\n",
" ../data/aircraft-data_nov_dec.csv : tail_number\n",
" ../data/aircraft-data_nov_dec.csv : reg_city\n",
" ../data/aircraft-data_nov_dec.csv : flight\n",
" ../data/aircraft-data_nov_dec.csv : tail_number\n",
" ../data/aircraft-data_nov_dec.csv : reg_expiration\n",
" ../data/aircraft-data_nov_dec.csv : reg_owner\n",
" ../data/aircraft-data_nov_dec.csv : manufacturer\n",
" ../data/aircraft-data_nov_dec.csv : reg_owner\n",
" ../data/aircraft-data_nov_dec.csv : model\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning: tzname AG identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n",
" warnings.warn(\"tzname {tzname} identified but not understood. \"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" ../data/Airplane_Cleaned.csv : TP mods\n",
" ../data/Airplane_Cleaned.csv : Multi Engine\n",
" ../data/Airplane_Cleaned.csv : Engine Type\n",
" ../data/Airplane_Cleaned.csv : Company\n",
" ../data/Airplane_Cleaned.csv : Model\n",
" ../data/autoscout24-germany-dataset.csv : make\n",
" ../data/autoscout24-germany-dataset.csv : gear\n",
" ../data/autoscout24-germany-dataset.csv : model\n",
" ../data/autoscout24-germany-dataset.csv : offerType\n",
" ../data/autoscout24-germany-dataset.csv : fuel\n",
" ../data/CARS_1.csv : fuel_type\n",
" ../data/CARS_1.csv : transmission_type\n",
" ../data/CARS_1.csv : car_name\n",
" ../data/CARS_1.csv : body_type\n",
" ../data/USA_cars_datasets.csv : model\n",
" ../data/USA_cars_datasets.csv : brand\n",
" ../data/USA_cars_datasets.csv : country\n",
" ../data/USA_cars_datasets.csv : vin\n",
" ../data/USA_cars_datasets.csv : title_status\n",
" ../data/USA_cars_datasets.csv : condition\n",
" ../data/USA_cars_datasets.csv : state\n",
" ../data/USA_cars_datasets.csv : color\n",
" ../data/imdb_top_1000.csv : Poster_Link\n",
" ../data/imdb_top_1000.csv : Gross\n",
" ../data/imdb_top_1000.csv : Certificate\n",
" ../data/imdb_top_1000.csv : Series_Title\n",
" ../data/imdb_top_1000.csv : Star3\n",
" ../data/imdb_top_1000.csv : Director\n",
" ../data/imdb_top_1000.csv : Star2\n",
" ../data/imdb_top_1000.csv : Star1\n",
" ../data/imdb_top_1000.csv : Star4\n",
" ../data/imdb_top_1000.csv : Overview\n",
" ../data/imdb_top_1000.csv : Genre\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\ab032mj\\Desktop\\thesis\\simillarity\\venv\\Lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning: tzname ELSIE identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n",
" warnings.warn(\"tzname {tzname} identified but not understood. \"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" ../data/netflix_titles.csv : show_id\n",
" ../data/netflix_titles.csv : title\n",
" ../data/netflix_titles.csv : description\n",
" ../data/netflix_titles.csv : director\n",
" ../data/netflix_titles.csv : cast\n",
" ../data/netflix_titles.csv : country\n",
" ../data/netflix_titles.csv : rating\n",
" ../data/netflix_titles.csv : type\n",
" ../data/netflix_titles.csv : duration\n",
" ../data/netflix_titles.csv : listed_in\n",
" ../data/netflix_titles.csv : date_added\n"
]
}
],
"source": [
@@ -196,8 +125,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-05-14T14:16:20.221739Z",
"start_time": "2024-05-14T14:15:40.068761Z"
"end_time": "2024-05-16T09:45:42.139139Z",
"start_time": "2024-05-16T09:45:32.026381Z"
}
},
"id": "cfe57003e670ba15",
@@ -232,12 +161,12 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-26T09:58:32.218154Z",
"start_time": "2024-03-26T09:58:32.213907Z"
"end_time": "2024-05-16T09:45:42.145314Z",
"start_time": "2024-05-16T09:45:42.140194Z"
}
},
"id": "19c881d9f450b556",
"execution_count": 16
"execution_count": 5
},
{
"cell_type": "code",
@@ -246,38 +175,34 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Processing column: reg_city1 1.92%\n"
]
},
{
"ename": "ValueError",
"evalue": "invalid literal for int() with base 10: 'column2vec_avg'",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mValueError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[5], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m vectors_avg \u001B[38;5;241m=\u001B[39m \u001B[43mget_vectors\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcolumn2vec_avg\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdata\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\Desktop\\thesis\\simillarity\\column2Vec\\functions.py:61\u001B[0m, in \u001B[0;36mget_vectors\u001B[1;34m(function, data)\u001B[0m\n\u001B[0;32m 59\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m key \u001B[38;5;129;01min\u001B[39;00m data:\n\u001B[0;32m 60\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mProcessing column: \u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;241m+\u001B[39m key \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m \u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;241m+\u001B[39m \u001B[38;5;28mstr\u001B[39m(\u001B[38;5;28mround\u001B[39m((count \u001B[38;5;241m/\u001B[39m \u001B[38;5;28mlen\u001B[39m(data)) \u001B[38;5;241m*\u001B[39m \u001B[38;5;241m100\u001B[39m, \u001B[38;5;241m2\u001B[39m)) \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m%\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m---> 61\u001B[0m result[key] \u001B[38;5;241m=\u001B[39m \u001B[43mfunction\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdata\u001B[49m\u001B[43m[\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtrained_model\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_module\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 62\u001B[0m count \u001B[38;5;241m+\u001B[39m\u001B[38;5;241m=\u001B[39m \u001B[38;5;241m1\u001B[39m\n\u001B[0;32m 63\u001B[0m end \u001B[38;5;241m=\u001B[39m time\u001B[38;5;241m.\u001B[39mtime()\n",
"File \u001B[1;32m~\\Desktop\\thesis\\simillarity\\column2Vec\\Column2Vec.py:159\u001B[0m, in \u001B[0;36mcolumn2vec_avg\u001B[1;34m(column, model, key)\u001B[0m\n\u001B[0;32m 157\u001B[0m encoded_columns \u001B[38;5;241m=\u001B[39m model\u001B[38;5;241m.\u001B[39mencode(column_clean)\n\u001B[0;32m 158\u001B[0m to_ret \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39mmean(encoded_columns, axis\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m0\u001B[39m) \u001B[38;5;66;03m# counts arithmetic mean (average)\u001B[39;00m\n\u001B[1;32m--> 159\u001B[0m \u001B[43mcache\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43msave\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mfunction_string\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mto_ret\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 160\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m to_ret\n",
"File \u001B[1;32m~\\Desktop\\thesis\\simillarity\\column2Vec\\Column2Vec.py:46\u001B[0m, in \u001B[0;36mCache.save\u001B[1;34m(self, key, function, embedding)\u001B[0m\n\u001B[0;32m 39\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21msave\u001B[39m(\u001B[38;5;28mself\u001B[39m, key: \u001B[38;5;28mstr\u001B[39m, function: \u001B[38;5;28mstr\u001B[39m, embedding: \u001B[38;5;28mlist\u001B[39m):\n\u001B[0;32m 40\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[0;32m 41\u001B[0m \u001B[38;5;124;03m Saves cache\u001B[39;00m\n\u001B[0;32m 42\u001B[0m \u001B[38;5;124;03m :param key: Column name\u001B[39;00m\n\u001B[0;32m 43\u001B[0m \u001B[38;5;124;03m :param function: Function name\u001B[39;00m\n\u001B[0;32m 44\u001B[0m \u001B[38;5;124;03m :param embedding: to save\u001B[39;00m\n\u001B[0;32m 45\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m---> 46\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m|\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28;43mint\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mfunction\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m| : |\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mint\u001B[39m(key)\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m|\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m 47\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m__cache\u001B[38;5;241m.\u001B[39mat[function, key] \u001B[38;5;241m=\u001B[39m embedding\n",
"\u001B[1;31mValueError\u001B[0m: invalid literal for int() with base 10: 'column2vec_avg'"
"Processing column: reg_state1 12.5%\n",
"Processing column: reg_city1 25.0%\n",
"Processing column: flight1 37.5%\n",
"Processing column: tail_number1 50.0%\n",
"Processing column: reg_expiration1 62.5%\n",
"Processing column: manufacturer1 75.0%\n",
"Processing column: reg_owner1 87.5%\n",
"Processing column: model1 100.0%\n",
"ELAPSED TIME :0.005300045013427734\n"
]
}
],
"source": [
"\n",
"\n",
"vectors_avg = get_vectors(column2vec_avg, data)"
"from column2Vec.Column2Vec import cache\n",
"cache.off()\n",
"vectors_avg = get_vectors(column2vec_avg, data)\n",
"# vectors_avg2 = get_vectors(column2vec_avg, data)\n",
"# cache.save_persistently()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-05-14T14:16:23.159447Z",
"start_time": "2024-05-14T14:16:20.222742Z"
"end_time": "2024-05-16T09:45:42.163420Z",
"start_time": "2024-05-16T09:45:42.145921Z"
}
},
"id": "d18443a1c921f509",
"execution_count": 5
"execution_count": 6
},
{
"cell_type": "markdown",
