From f0e36cdf667908c720de4ca95191f7276a95eb24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=AA=20Trung=20Ho=C3=A0ng?=
Date: Mon, 18 Dec 2023 03:14:45 +0700
Subject: [PATCH] Add Temporal-Item-Frequency-based User-KNN (TIFUKNN) model (#566)

* Add Temporal-Item-Frequency-based User-KNN (TIFUKNN) model for next basket recommendation

* replace KNN with KDTree

* refactor code

* Fix scoring function using the average neighbors vectors
---
 README.md                              |   1 +
 cornac/models/__init__.py              |   1 +
 cornac/models/tifuknn/__init__.py      |  16 +++
 cornac/models/tifuknn/recom_tifuknn.py | 171 +++++++++++++++++++++++++
 examples/README.md                     |   2 +
 examples/tifuknn_tafeng.py             |  45 +++++++
 6 files changed, 236 insertions(+)
 create mode 100644 cornac/models/tifuknn/__init__.py
 create mode 100644 cornac/models/tifuknn/recom_tifuknn.py
 create mode 100644 examples/tifuknn_tafeng.py

diff --git a/README.md b/README.md
index 6bea6ec24..07a8fcdaa 100644
--- a/README.md
+++ b/README.md
@@ -152,6 +152,7 @@ The recommender models supported by Cornac are listed below. Why don't you join
 | 2020 | [Adversarial Training Towards Robust Multimedia Recommender System (AMR)](cornac/models/amr), [paper](https://ieeexplore.ieee.org/document/8618394) | [requirements.txt](cornac/models/amr/requirements.txt) | [amr_clothing.py](examples/amr_clothing.py)
 | | [Hybrid neural recommendation with joint deep representation learning of ratings and reviews (HRDR)](cornac/models/hrdr), [paper](https://www.sciencedirect.com/science/article/abs/pii/S0925231219313207) | [requirements.txt](cornac/models/hrdr/requirements.txt) | [hrdr_example.py](examples/hrdr_example.py)
 | | [LightGCN: Simplifying and Powering Graph Convolution Network for Recommendation](cornac/models/lightgcn), [paper](https://arxiv.org/pdf/2002.02126.pdf) | [requirements.txt](cornac/models/lightgcn/requirements.txt) | [lightgcn_example.py](examples/lightgcn_example.py)
+| | [Temporal-Item-Frequency-based User-KNN (TIFUKNN)](cornac/models/tifuknn), [paper](https://arxiv.org/pdf/2006.00556.pdf) | N/A | [tifuknn_tafeng.py](examples/tifuknn_tafeng.py)
 | 2019 | [Embarrassingly Shallow Autoencoders for Sparse Data (EASEᴿ)](cornac/models/ease), [paper](https://arxiv.org/pdf/1905.03375.pdf) | N/A | [ease_movielens.py](examples/ease_movielens.py)
 | | [Neural Graph Collaborative Filtering (NGCF)](cornac/models/ngcf), [paper](https://arxiv.org/pdf/1905.08108.pdf) | [requirements.txt](cornac/models/ngcf/requirements.txt) | [ngcf_example.py](examples/ngcf_example.py)
 | 2018 | [Collaborative Context Poisson Factorization (C2PF)](cornac/models/c2pf), [paper](https://www.ijcai.org/proceedings/2018/0370.pdf) | N/A | [c2pf_exp.py](examples/c2pf_example.py)
diff --git a/cornac/models/__init__.py b/cornac/models/__init__.py
index 0367f5e86..23a626e9a 100644
--- a/cornac/models/__init__.py
+++ b/cornac/models/__init__.py
@@ -68,6 +68,7 @@
 from .skm import SKMeans
 from .sorec import SoRec
 from .svd import SVD
+from .tifuknn import TIFUKNN
 from .trirank import TriRank
 from .vaecf import VAECF
 from .vbpr import VBPR
diff --git a/cornac/models/tifuknn/__init__.py b/cornac/models/tifuknn/__init__.py
new file mode 100644
index 000000000..95ffe08aa
--- /dev/null
+++ b/cornac/models/tifuknn/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2023 The Cornac Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+from .recom_tifuknn import TIFUKNN
diff --git a/cornac/models/tifuknn/recom_tifuknn.py b/cornac/models/tifuknn/recom_tifuknn.py
new file mode 100644
index 000000000..0bf57c3d3
--- /dev/null
+++ b/cornac/models/tifuknn/recom_tifuknn.py
@@ -0,0 +1,171 @@
+# Copyright 2023 The Cornac Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import warnings
+from time import time
+
+import numpy as np
+from tqdm import tqdm
+
+from ..recommender import NextBasketRecommender
+
+
+class TIFUKNN(NextBasketRecommender):
+    """Temporal-Item-Frequency-based User-KNN (TIFUKNN)
+
+    Parameters
+    ----------
+    name: string, default: 'TIFUKNN'
+        The name of the recommender model.
+
+    n_neighbors: int, optional, default: 300
+        The number of nearest neighbors used in the KNN lookup.
+
+    within_decay_rate: float, optional, default: 0.9
+        Time-decay ratio applied across the baskets in a user's history,
+        in the range [0, 1]; older baskets receive smaller weights.
+
+    group_decay_rate: float, optional, default: 0.7
+        Time-decay ratio applied across basket groups, in the range [0, 1];
+        older groups receive smaller weights.
+
+    alpha: float, optional, default: 0.7
+        Trade-off factor between the target user's own vector and the average
+        of the neighbors' vectors when computing the final item scores.
+
+    n_groups: int, optional, default: 7
+        The historical baskets are partitioned into `n_groups` groups of
+        (approximately) equal size.
+
+    verbose: boolean, optional, default: False
+        When True, running logs are displayed.
+
+    References
+    ----------
+    Haoji Hu, Xiangnan He, Jinyang Gao, and Zhi-Li Zhang. 2020.
+    Modeling Personalized Item Frequency Information for Next-basket Recommendation.
+    In Proceedings of the 43rd International ACM SIGIR Conference on Research and
+    Development in Information Retrieval (SIGIR '20). Association for Computing
+    Machinery, New York, NY, USA, 1071–1080.
+    https://doi.org/10.1145/3397271.3401066
+
+    """
+
+    def __init__(
+        self,
+        name="TIFUKNN",
+        n_neighbors=300,
+        within_decay_rate=0.9,
+        group_decay_rate=0.7,
+        alpha=0.7,
+        n_groups=7,
+        verbose=False,
+    ):
+        super().__init__(name=name, trainable=False, verbose=verbose)
+        assert 0 <= within_decay_rate <= 1
+        assert 0 <= group_decay_rate <= 1
+        self.n_neighbors = n_neighbors
+        self.within_decay_rate = within_decay_rate
+        self.group_decay_rate = group_decay_rate
+        self.alpha = alpha
+        self.n_groups = n_groups
+
+    def fit(self, train_set, val_set=None):
+        from scipy.spatial import KDTree
+
+        super().fit(train_set=train_set, val_set=val_set)
+        self.user_vectors = self._get_user_vectors(self.train_set)
+        if self.n_neighbors > len(self.user_vectors):
+            warnings.warn(
+                "Number of users (%d) is smaller than n_neighbors (%d); using all users as neighbors"
+                % (len(self.user_vectors), self.n_neighbors)
+            )
+            self.n_neighbors = len(self.user_vectors)
+
+        start_time = time()
+        if self.verbose:
+            print("Constructing kd-tree for quick nearest-neighbor lookup")
+        self.tree = KDTree(self.user_vectors)
+        if self.verbose:
+            print("Constructing kd-tree took %.1f seconds" % (time() - start_time))
+        return self
+
+    def _get_user_vectors(self, data_set):
+        user_vectors = []
+        for _, _, [basket_items] in tqdm(
+            data_set.ubi_iter(batch_size=1, shuffle=False),
+            desc="Getting user vectors",
+            total=data_set.num_users,
+        ):
+            # Exclude each user's last basket (the prediction target) from the history.
+            user_vectors.append(self._compute_user_vector(basket_items[:-1]))
+        user_vectors = np.asarray(user_vectors, dtype="float32")
+        return user_vectors
+
+    def _compute_user_vector(self, history_baskets):
+        his_list = []
+        n_baskets = len(history_baskets)
+        for inc, iids in enumerate(history_baskets):
+            his_vec = np.zeros(self.total_items, dtype="float32")
+            # Older baskets are down-weighted by the within-basket decay rate.
+            decayed_val = np.power(self.within_decay_rate, n_baskets - inc - 1)
+            for iid in iids:
+                his_vec[iid] = decayed_val
+            his_list.append(his_vec)
+        grouped_list, real_n_groups = self._group_history_list(his_list, self.n_groups)
+        his_vec = np.zeros(self.total_items, dtype="float32")
+        if real_n_groups == 0:
+            return his_vec
+
+        # Older groups are down-weighted by the group decay rate.
+        for idx in range(real_n_groups):
+            decayed_val = np.power(self.group_decay_rate, self.n_groups - idx - 1)
+            his_vec += grouped_list[idx] * decayed_val
+
+        return his_vec / real_n_groups
+
+    def _group_history_list(self, his_list, n_groups):
+        # Partition the basket vectors into blocks of (approximately) equal size
+        # and average the vectors within each block.
+        grouped_vec_list = []
+        if len(his_list) < n_groups:
+            # Fewer baskets than groups: each basket forms its own group.
+            for j in range(len(his_list)):
+                grouped_vec_list.append(his_list[j])
+            return grouped_vec_list, len(his_list)
+        else:
+            est_num_vec_each_block = len(his_list) / n_groups
+            base_num_vec_each_block = int(np.floor(len(his_list) / n_groups))
+            residual = est_num_vec_each_block - base_num_vec_each_block
+
+            num_vec_has_extra_vec = int(np.round(residual * n_groups))
+
+            if residual == 0:
+                # The history divides evenly: every block averages the same number of baskets.
+                for i in range(n_groups):
+                    group_sum = np.zeros(len(his_list[0]))
+                    for j in range(base_num_vec_each_block):
+                        group_sum += his_list[i * base_num_vec_each_block + j]
+                    grouped_vec_list.append(group_sum / base_num_vec_each_block)
+            else:
+                # Uneven split: the first blocks take the base size, the last
+                # `num_vec_has_extra_vec` blocks take `est_num` baskets each.
+                for i in range(n_groups - num_vec_has_extra_vec):
+                    group_sum = np.zeros(len(his_list[0]))
+                    for j in range(base_num_vec_each_block):
+                        group_sum += his_list[i * base_num_vec_each_block + j]
+                        last_idx = i * base_num_vec_each_block + j
+                    grouped_vec_list.append(group_sum / base_num_vec_each_block)
+
+                est_num = int(np.ceil(est_num_vec_each_block))
+                start_group_idx = n_groups - num_vec_has_extra_vec
+                if len(his_list) - start_group_idx * base_num_vec_each_block >= est_num_vec_each_block:
+                    for i in range(start_group_idx, n_groups):
+                        group_sum = np.zeros(len(his_list[0]))
+                        for j in range(est_num):
+                            iidxx = last_idx + 1 + (i - start_group_idx) * est_num + j
+                            group_sum += his_list[iidxx]
+                        grouped_vec_list.append(group_sum / est_num)
+
+        return grouped_vec_list, n_groups
+
+    def score(self, user_idx, history_baskets, **kwargs):
+        if len(history_baskets) == 0:
+            return np.zeros(self.total_items, dtype="float32")
+        user_vector = self._compute_user_vector(history_baskets)
+        _, indices = self.tree.query([user_vector], k=self.n_neighbors)
+        # Blend the user's own vector with the element-wise average of the
+        # nearest neighbors' vectors.
+        return self.alpha * user_vector + (1 - self.alpha) * np.mean(
+            self.user_vectors[indices.squeeze()], axis=0
+        )
diff --git a/examples/README.md b/examples/README.md
index 1a7058027..5ed367552 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -109,3 +109,5 @@
 ## Next-Basket Algorithms
 
 [gp_top_tafeng.py](gp_top_tafeng.py) - Next-basket recommendation model that merely uses item top frequency.
+
+[tifuknn_tafeng.py](tifuknn_tafeng.py) - Example of Temporal-Item-Frequency-based User-KNN (TIFUKNN).
diff --git a/examples/tifuknn_tafeng.py b/examples/tifuknn_tafeng.py
new file mode 100644
index 000000000..07fbf07d8
--- /dev/null
+++ b/examples/tifuknn_tafeng.py
@@ -0,0 +1,45 @@
+# Copyright 2023 The Cornac Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Example of Temporal-Item-Frequency-based User-KNN (TIFUKNN)"""
+
+import cornac
+from cornac.eval_methods import NextBasketEvaluation
+from cornac.metrics import NDCG, HitRatio, Recall
+from cornac.models import TIFUKNN
+
+data = cornac.datasets.tafeng.load_basket(
+    reader=cornac.data.Reader(
+        min_basket_size=3, max_basket_size=50, min_basket_sequence=2
+    )
+)
+
+next_basket_eval = NextBasketEvaluation(
+    data=data, fmt="UBITJson", test_size=0.2, val_size=0.08, seed=123, verbose=True
+)
+
+models = [
+    TIFUKNN(
+        n_neighbors=300,
+        within_decay_rate=0.9,
+        group_decay_rate=0.7,
+        alpha=0.7,
+        n_groups=7,
+    )
+]
+
+metrics = [
+    Recall(k=10),
+    Recall(k=50),
+    NDCG(k=10),
+    NDCG(k=50),
+    HitRatio(k=10),
+    HitRatio(k=50),
+]
+
+cornac.Experiment(eval_method=next_basket_eval, models=models, metrics=metrics).run()
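
For reviewers who want to sanity-check the scoring logic outside of Cornac, here is a minimal standalone sketch of the idea implemented in `recom_tifuknn.py` above: build a temporal-decayed item-frequency vector per user, group and decay the basket history, find nearest neighbors with a kd-tree, and blend the user's own vector with the neighbors' average. The toy baskets, `n_items`, the two-group split, and the `numpy.array_split` grouping are illustrative simplifications of `_group_history_list`, not the patch's exact behavior.

```python
import numpy as np
from scipy.spatial import KDTree

# Toy setup: sizes, hyperparameters, and baskets below are made up for illustration.
n_items = 6
within_decay_rate, group_decay_rate, alpha, n_groups = 0.9, 0.7, 0.7, 2


def user_vector(baskets):
    """Temporal-decayed item-frequency vector for one user's basket history."""
    n = len(baskets)
    basket_vecs = []
    for t, items in enumerate(baskets):
        vec = np.zeros(n_items, dtype="float32")
        vec[list(items)] = within_decay_rate ** (n - t - 1)  # older baskets decay
        basket_vecs.append(vec)
    # Simplified grouping: numpy.array_split stands in for _group_history_list.
    blocks = np.array_split(np.stack(basket_vecs), min(n_groups, n))
    out = np.zeros(n_items, dtype="float32")
    for g, block in enumerate(blocks):
        out += block.mean(axis=0) * group_decay_rate ** (len(blocks) - g - 1)
    return out / len(blocks)


# Basket histories (lists of item ids) for three "training" users and one target user.
train_histories = [[[0, 1], [1, 2]], [[2, 3], [3, 4]], [[0, 4], [4, 5]]]
train_vectors = np.stack([user_vector(h) for h in train_histories])
target_vector = user_vector([[1, 2], [2, 3]])

# KNN lookup via a kd-tree, then blend the user's own vector with the
# element-wise average of the neighbors' vectors (the role of alpha).
tree = KDTree(train_vectors)
_, idx = tree.query([target_vector], k=2)
neighbor_avg = train_vectors[idx.squeeze()].mean(axis=0)
scores = alpha * target_vector + (1 - alpha) * neighbor_avg
print(scores)  # higher score = item more likely in the next basket
```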