-
Notifications
You must be signed in to change notification settings - Fork 109
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
iggy
committed
Jan 19, 2023
1 parent
0e4ea30
commit 54c97fd
Showing
11 changed files
with
2,164 additions
and
217 deletions.
There are no files selected for viewing
313 changes: 313 additions & 0 deletions
313
...r-examples/.ipynb_checkpoints/0. Data Transform for Buffalo's input data-checkpoint.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,313 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Download movielens 1M dataset(https://grouplens.org/datasets/movielens/)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Archive: ml-1m.zip\n", | ||
" creating: data/ml-1m/\n", | ||
" inflating: data/ml-1m/movies.dat \n", | ||
" inflating: data/ml-1m/ratings.dat \n", | ||
" inflating: data/ml-1m/README \n", | ||
" inflating: data/ml-1m/users.dat \n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"!mkdir data\n", | ||
"!wget -q http://www.grouplens.org/system/files/ml-1m.zip ./data\n", | ||
"!unzip -o ml-1m -d data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Transform ml-1m dataset into Matrix Market Form" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"If you are not familiar with mm(matrix market) format, refer [this](http://networkrepository.com/mtx-matrix-market-format.html)\n", | ||
"\n", | ||
"If you need to know further on how buffalo handle data, check [Documentation on database of Buffalo](https://buffalo-recsys.readthedocs.io/en/latest/intro.html#database)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import numpy as np\n", | ||
"import pandas as pd\n", | ||
"from scipy.io import mmwrite\n", | ||
"from scipy.io import mmread\n", | ||
"from scipy.sparse import csr_matrix" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"ratings = pd.read_csv(\"data/ml-1m/ratings.dat\", header=None, sep=\"::\", engine='python')\n", | ||
"ratings.columns = [\"uid\", \"iid\", \"rating\", \"timestamp\"]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"movies = pd.read_csv('data/ml-1m/movies.dat', header=None, sep=\"::\", engine='python', encoding='latin-1')\n", | ||
"movies.columns = ['iid', 'movie_name', 'genre']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"buffalo iid does not support string with utf-8 encoding and having spaces.\n", | ||
"\n", | ||
"Therefore, we have to replace spaces and utf-8 text." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def parse_moviename(movie_name):\n", | ||
" return movie_name.replace(' ', '_').encode('utf-8').decode('ascii', 'ignore')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"iid_to_movie_name = dict(zip(movies.iid.tolist(), movies.movie_name.tolist()))\n", | ||
"iid_to_movie_name = {iid: parse_moviename(movie_name) for (iid, movie_name) in iid_to_movie_name.items()}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 13, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"uid_to_idx = {uid: idx for (idx, uid) in enumerate(ratings.uid.unique().tolist())}\n", | ||
"iid_to_idx = {iid: idx for (idx, iid) in enumerate(ratings.iid.unique().tolist())}\n", | ||
"idx_to_movie_name = {idx:iid_to_movie_name[iid] for (iid, idx) in iid_to_idx.items()}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 14, | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Examples of movie names\n", | ||
"\n", | ||
"[index 30] movie_name: Antz_(1998)\n", | ||
"[index 31] movie_name: Girl,_Interrupted_(1999)\n", | ||
"[index 32] movie_name: Hercules_(1997)\n", | ||
"[index 33] movie_name: Aladdin_(1992)\n", | ||
"[index 34] movie_name: Mulan_(1998)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(\"Examples of movie names\\n\")\n", | ||
"\n", | ||
"for i in range(30, 35):\n", | ||
" print(\"[index %d] movie_name: %s\" % (i, idx_to_movie_name[i]))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 15, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"row, col, dat = ratings.uid.tolist(), ratings.iid.tolist(), ratings.rating.tolist()\n", | ||
"row = [uid_to_idx[r] for r in row]\n", | ||
"col = [iid_to_idx[c] for c in col]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 16, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"train_matrix = csr_matrix((dat, (row,col)), shape=(1 + np.max(row), 1 + np.max(col)))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 17, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"(6040, 3706)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(train_matrix.shape)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### To transform csr matrix into matrix market format easily, we use mmwrite (matrix market write)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 18, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"mmwrite('data/ml-1m/main', train_matrix)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 19, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"with open(\"data/ml-1m/uid\", \"w\") as f:\n", | ||
" for uid in uid_to_idx:\n", | ||
" print(uid, file=f)\n", | ||
"\n", | ||
"with open(\"data/ml-1m/iid\", \"w\") as f:\n", | ||
" for iid, movie_name in idx_to_movie_name.items():\n", | ||
" print(movie_name, file=f)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Transform ml-1m dataset into Stream format" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Stream file format used in buffalo contains lines lists, having space as delimiter.\n", | ||
"\n", | ||
"One line is ordered list of items that each user interacted (ordered by time)\n", | ||
"\n", | ||
"This is useful when the order between interactions are considered(e.g., word2vec, Cofactor).\n", | ||
"\n", | ||
"See `2. Cofactor` or `3. Word2vec` to see the case where Stream format data is used\n", | ||
"\n", | ||
"If you need to know further on Stream format data, check [Documentation on database of Buffalo](https://buffalo-recsys.readthedocs.io/en/latest/intro.html#database)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"ratings_as_list = ratings.sort_values(by='timestamp').groupby('uid').iid.apply(list).reset_index()\n", | ||
"uid = ratings_as_list.uid.tolist()\n", | ||
"seen_iids = ratings_as_list.iid.tolist()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 21, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"seen_iids = [' '.join([iid_to_movie_name[iid] for iid in iids]) for iids in seen_iids]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 22, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Girl,_Interrupted_(1999) Titanic_(1997) Back_to_the_Future_(1985) Cinderella_(1950) Meet_Joe_Black_(1998) Last_Days_of_Disco,_The_(1998) Erin_Brockovich_(2000) To_Kill_a_Mockingbird_(1962) Christmas_Story,_A_(1983) Star_Wars:_Episode_IV_-_A_New_Hope_(1977) Wallace_&_Gromit:_The_Best_of_Aardman_Animation_(1996) One_Flew_Over_the_Cuckoo's_Nest_(1975) Wizard_of_Oz,_The_(1939) Fargo_(1996) Run_Lola_Run_(Lola_rennt)_(1998) Rain_Man_(1988) Saving_Private_Ryan_(1998) Awakenings_(1990) Gigi_(1958) Sound_of_Music,_The_(1965) Driving_Miss_Daisy_(1989) Mary_Poppins_(1964) Bambi_(1942) Apollo_13_(1995) E.T._the_Extra-Terrestrial_(1982) My_Fair_Lady_(1964) Ben-Hur_(1959) Big_(1988) Dead_Poets_Society_(1989) Sixth_Sense,_The_(1999) James_and_the_Giant_Peach_(1996) Ferris_Bueller's_Day_Off_(1986) Secret_Garden,_The_(1993) Toy_Story_2_(1999) Airplane!_(1980) Dumbo_(1941) Pleasantville_(1998) Princess_Bride,_The_(1987) Snow_White_and_the_Seven_Dwarfs_(1937) Miracle_on_34th_Street_(1947) Ponette_(1996) Schindler's_List_(1993) Close_Shave,_A_(1995) Beauty_and_the_Beast_(1991) Aladdin_(1992) Toy_Story_(1995) Tarzan_(1999) Hunchback_of_Notre_Dame,_The_(1996) Antz_(1998) Bug's_Life,_A_(1998) Mulan_(1998) Hercules_(1997) Pocahontas_(1995)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(seen_iids[0])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 23, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"with open(\"data/ml-1m/stream\", \"w\") as f:\n", | ||
" for iid_list in seen_iids:\n", | ||
" print(iid_list, file=f)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.