From f0e36cdf667908c720de4ca95191f7276a95eb24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=AA=20Trung=20Ho=C3=A0ng?=
Date: Mon, 18 Dec 2023 03:14:45 +0700
Subject: [PATCH] Add Temporal-Item-Frequency-based User-KNN (TIFUKNN) model (#566)

* Add Temporal-Item-Frequency-based User-KNN (TIFUKNN) model for next basket recommendation

* replace KNN with KDTree

* refactor code

* Fix scoring function using the average neighbors vectors
---
 README.md                              |   1 +
 cornac/models/__init__.py              |   1 +
 cornac/models/tifuknn/__init__.py      |  16 +++
 cornac/models/tifuknn/recom_tifuknn.py | 171 +++++++++++++++++++++++++
 examples/README.md                     |   2 +
 examples/tifuknn_tafeng.py             |  45 +++++++
 6 files changed, 236 insertions(+)
 create mode 100644 cornac/models/tifuknn/__init__.py
 create mode 100644 cornac/models/tifuknn/recom_tifuknn.py
 create mode 100644 examples/tifuknn_tafeng.py

diff --git a/README.md b/README.md
index 6bea6ec24..07a8fcdaa 100644
--- a/README.md
+++ b/README.md
@@ -152,6 +152,7 @@ The recommender models supported by Cornac are listed below. Why don't you join
 | 2020 | [Adversarial Training Towards Robust Multimedia Recommender System (AMR)](cornac/models/amr), [paper](https://ieeexplore.ieee.org/document/8618394) | [requirements.txt](cornac/models/amr/requirements.txt) | [amr_clothing.py](examples/amr_clothing.py)
 | | [Hybrid neural recommendation with joint deep representation learning of ratings and reviews (HRDR)](cornac/models/hrdr), [paper](https://www.sciencedirect.com/science/article/abs/pii/S0925231219313207) | [requirements.txt](cornac/models/hrdr/requirements.txt) | [hrdr_example.py](examples/hrdr_example.py)
 | | [LightGCN: Simplifying and Powering Graph Convolution Network for Recommendation](cornac/models/lightgcn), [paper](https://arxiv.org/pdf/2002.02126.pdf) | [requirements.txt](cornac/models/lightgcn/requirements.txt) | [lightgcn_example.py](examples/lightgcn_example.py)
+| | [Temporal-Item-Frequency-based User-KNN (TIFUKNN)](cornac/models/tifuknn), [paper](https://arxiv.org/pdf/2006.00556.pdf) | N/A | [tifuknn_tafeng.py](examples/tifuknn_tafeng.py)
 | 2019 | [Embarrassingly Shallow Autoencoders for Sparse Data (EASEᴿ)](cornac/models/ease), [paper](https://arxiv.org/pdf/1905.03375.pdf) | N/A | [ease_movielens.py](examples/ease_movielens.py)
 | | [Neural Graph Collaborative Filtering (NGCF)](cornac/models/ngcf), [paper](https://arxiv.org/pdf/1905.08108.pdf) | [requirements.txt](cornac/models/ngcf/requirements.txt) | [ngcf_example.py](examples/ngcf_example.py)
 | 2018 | [Collaborative Context Poisson Factorization (C2PF)](cornac/models/c2pf), [paper](https://www.ijcai.org/proceedings/2018/0370.pdf) | N/A | [c2pf_exp.py](examples/c2pf_example.py)
diff --git a/cornac/models/__init__.py b/cornac/models/__init__.py
index 0367f5e86..23a626e9a 100644
--- a/cornac/models/__init__.py
+++ b/cornac/models/__init__.py
@@ -68,6 +68,7 @@
 from .skm import SKMeans
 from .sorec import SoRec
 from .svd import SVD
+from .tifuknn import TIFUKNN
 from .trirank import TriRank
 from .vaecf import VAECF
 from .vbpr import VBPR
diff --git a/cornac/models/tifuknn/__init__.py b/cornac/models/tifuknn/__init__.py
new file mode 100644
index 000000000..95ffe08aa
--- /dev/null
+++ b/cornac/models/tifuknn/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2023 The Cornac Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+from .recom_tifuknn import TIFUKNN
diff --git a/cornac/models/tifuknn/recom_tifuknn.py b/cornac/models/tifuknn/recom_tifuknn.py
new file mode 100644
index 000000000..0bf57c3d3
--- /dev/null
+++ b/cornac/models/tifuknn/recom_tifuknn.py
@@ -0,0 +1,171 @@
+# Copyright 2023 The Cornac Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import warnings
+from time import time
+
+import numpy as np
+from tqdm import tqdm
+
+from ..recommender import NextBasketRecommender
+
+
+class TIFUKNN(NextBasketRecommender):
+    """Temporal-Item-Frequency-based User-KNN (TIFUKNN)
+
+    Parameters
+    ----------
+    name: string, default: 'TIFUKNN'
+        The name of the recommender model.
+
+    n_neighbors: int, optional, default: 300
+        The number of nearest neighbors used in the KNN lookup.
+
+    within_decay_rate: float, optional, default: 0.9
+        Time-decay ratio applied across the baskets in a user's history,
+        in the range [0, 1]; older baskets receive smaller weights.
+
+    group_decay_rate: float, optional, default: 0.7
+        Time-decay ratio applied across basket groups, in the range [0, 1];
+        older groups receive smaller weights.
+
+    alpha: float, optional, default: 0.7
+        Trade-off factor between the target user's own vector and the average
+        of the neighbors' vectors when computing the final item scores.
+
+    n_groups: int, optional, default: 7
+        The historical baskets are partitioned into `n_groups` groups of
+        (approximately) equal size.
+
+    verbose: boolean, optional, default: False
+        When True, running logs are displayed.
+
+    References
+    ----------
+    Haoji Hu, Xiangnan He, Jinyang Gao, and Zhi-Li Zhang. 2020.
+    Modeling Personalized Item Frequency Information for Next-basket Recommendation.
+    In Proceedings of the 43rd International ACM SIGIR Conference on Research and
+    Development in Information Retrieval (SIGIR '20). Association for Computing
+    Machinery, New York, NY, USA, 1071–1080.
+    https://doi.org/10.1145/3397271.3401066
+
+    """
+
+    def __init__(
+        self,
+        name="TIFUKNN",
+        n_neighbors=300,
+        within_decay_rate=0.9,
+        group_decay_rate=0.7,
+        alpha=0.7,
+        n_groups=7,
+        verbose=False,
+    ):
+        super().__init__(name=name, trainable=False, verbose=verbose)
+        assert 0 <= within_decay_rate <= 1
+        assert 0 <= group_decay_rate <= 1
+        self.n_neighbors = n_neighbors
+        self.within_decay_rate = within_decay_rate
+        self.group_decay_rate = group_decay_rate
+        self.alpha = alpha
+        self.n_groups = n_groups
+
+    def fit(self, train_set, val_set=None):
+        from scipy.spatial import KDTree
+
+        super().fit(train_set=train_set, val_set=val_set)
+        self.user_vectors = self._get_user_vectors(self.train_set)
+        if self.n_neighbors > len(self.user_vectors):
+            warnings.warn(
+                "Number of users (%d) is smaller than n_neighbors (%d); using all users as neighbors"
+                % (len(self.user_vectors), self.n_neighbors)
+            )
+            self.n_neighbors = len(self.user_vectors)
+
+        start_time = time()
+        if self.verbose:
+            print("Constructing kd-tree for quick nearest-neighbor lookup")
+        self.tree = KDTree(self.user_vectors)
+        if self.verbose:
+            print("Constructing kd-tree took %.1f seconds" % (time() - start_time))
+        return self
+
+    def _get_user_vectors(self, data_set):
+        user_vectors = []
+        for _, _, [basket_items] in tqdm(
+            data_set.ubi_iter(batch_size=1, shuffle=False),
+            desc="Getting user vectors",
+            total=data_set.num_users,
+        ):
+            # Exclude each user's last basket (the prediction target) from the history.
+            user_vectors.append(self._compute_user_vector(basket_items[:-1]))
+        user_vectors = np.asarray(user_vectors, dtype="float32")
+        return user_vectors
+
+    def _compute_user_vector(self, history_baskets):
+        his_list = []
+        n_baskets = len(history_baskets)
+        for inc, iids in enumerate(history_baskets):
+            his_vec = np.zeros(self.total_items, dtype="float32")
+            # Older baskets are down-weighted by the within-basket decay rate.
+            decayed_val = np.power(self.within_decay_rate, n_baskets - inc - 1)
+            for iid in iids:
+                his_vec[iid] = decayed_val
+            his_list.append(his_vec)
+        grouped_list, real_n_groups = self._group_history_list(his_list, self.n_groups)
+        his_vec = np.zeros(self.total_items, dtype="float32")
+        if real_n_groups == 0:
+            return his_vec
+
+        # Older groups are down-weighted by the group decay rate.
+        for idx in range(real_n_groups):
+            decayed_val = np.power(self.group_decay_rate, self.n_groups - idx - 1)
+            his_vec += grouped_list[idx] * decayed_val
+
+        return his_vec / real_n_groups
+
+    def _group_history_list(self, his_list, n_groups):
+        # Partition the basket vectors into blocks of (approximately) equal size
+        # and average the vectors within each block.
+        grouped_vec_list = []
+        if len(his_list) < n_groups:
+            # Fewer baskets than groups: each basket forms its own group.
+            for j in range(len(his_list)):
+                grouped_vec_list.append(his_list[j])
+            return grouped_vec_list, len(his_list)
+        else:
+            est_num_vec_each_block = len(his_list) / n_groups
+            base_num_vec_each_block = int(np.floor(len(his_list) / n_groups))
+            residual = est_num_vec_each_block - base_num_vec_each_block
+
+            num_vec_has_extra_vec = int(np.round(residual * n_groups))
+
+            if residual == 0:
+                # The history divides evenly: every block averages the same number of baskets.
+                for i in range(n_groups):
+                    group_sum = np.zeros(len(his_list[0]))
+                    for j in range(base_num_vec_each_block):
+                        group_sum += his_list[i * base_num_vec_each_block + j]
+                    grouped_vec_list.append(group_sum / base_num_vec_each_block)
+            else:
+                # Uneven split: the first blocks take the base size, the last
+                # `num_vec_has_extra_vec` blocks take `est_num` baskets each.
+                for i in range(n_groups - num_vec_has_extra_vec):
+                    group_sum = np.zeros(len(his_list[0]))
+                    for j in range(base_num_vec_each_block):
+                        group_sum += his_list[i * base_num_vec_each_block + j]
+                        last_idx = i * base_num_vec_each_block + j
+                    grouped_vec_list.append(group_sum / base_num_vec_each_block)
+
+                est_num = int(np.ceil(est_num_vec_each_block))
+                start_group_idx = n_groups - num_vec_has_extra_vec
+                if len(his_list) - start_group_idx * base_num_vec_each_block >= est_num_vec_each_block:
+                    for i in range(start_group_idx, n_groups):
+                        group_sum = np.zeros(len(his_list[0]))
+                        for j in range(est_num):
+                            iidxx = last_idx + 1 + (i - start_group_idx) * est_num + j
+                            group_sum += his_list[iidxx]
+                        grouped_vec_list.append(group_sum / est_num)
+
+        return grouped_vec_list, n_groups
+
+    def score(self, user_idx, history_baskets, **kwargs):
+        if len(history_baskets) == 0:
+            return np.zeros(self.total_items, dtype="float32")
+        user_vector = self._compute_user_vector(history_baskets)
+        _, indices = self.tree.query([user_vector], k=self.n_neighbors)
+        # Blend the user's own vector with the element-wise average of the
+        # nearest neighbors' vectors.
+        return self.alpha * user_vector + (1 - self.alpha) * np.mean(
+            self.user_vectors[indices.squeeze()], axis=0
+        )
diff --git a/examples/README.md b/examples/README.md
index 1a7058027..5ed367552 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -109,3 +109,5 @@
 ## Next-Basket Algorithms
 
 [gp_top_tafeng.py](gp_top_tafeng.py) - Next-basket recommendation model that merely uses item top frequency.
+
+[tifuknn_tafeng.py](tifuknn_tafeng.py) - Example of Temporal-Item-Frequency-based User-KNN (TIFUKNN).
diff --git a/examples/tifuknn_tafeng.py b/examples/tifuknn_tafeng.py
new file mode 100644
index 000000000..07fbf07d8
--- /dev/null
+++ b/examples/tifuknn_tafeng.py
@@ -0,0 +1,45 @@
+# Copyright 2023 The Cornac Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""Example of Temporal-Item-Frequency-based User-KNN (TIFUKNN)"""
+
+import cornac
+from cornac.eval_methods import NextBasketEvaluation
+from cornac.metrics import NDCG, HitRatio, Recall
+from cornac.models import TIFUKNN
+
+data = cornac.datasets.tafeng.load_basket(
+    reader=cornac.data.Reader(
+        min_basket_size=3, max_basket_size=50, min_basket_sequence=2
+    )
+)
+
+next_basket_eval = NextBasketEvaluation(
+    data=data, fmt="UBITJson", test_size=0.2, val_size=0.08, seed=123, verbose=True
+)
+
+models = [
+    TIFUKNN(
+        n_neighbors=300,
+        within_decay_rate=0.9,
+        group_decay_rate=0.7,
+        alpha=0.7,
+        n_groups=7,
+    )
+]
+
+metrics = [
+    Recall(k=10),
+    Recall(k=50),
+    NDCG(k=10),
+    NDCG(k=50),
+    HitRatio(k=10),
+    HitRatio(k=50),
+]
+
+cornac.Experiment(eval_method=next_basket_eval, models=models, metrics=metrics).run()
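
For reviewers who want to sanity-check the scoring logic outside of Cornac, here is a minimal standalone sketch of the idea implemented in `recom_tifuknn.py` above: build a temporal-decayed item-frequency vector per user, group and decay the basket history, find nearest neighbors with a kd-tree, and blend the user's own vector with the neighbors' average. The toy baskets, `n_items`, the two-group split, and the `numpy.array_split` grouping are illustrative simplifications of `_group_history_list`, not the patch's exact behavior.

```python
import numpy as np
from scipy.spatial import KDTree

# Toy setup: sizes, hyperparameters, and baskets below are made up for illustration.
n_items = 6
within_decay_rate, group_decay_rate, alpha, n_groups = 0.9, 0.7, 0.7, 2


def user_vector(baskets):
    """Temporal-decayed item-frequency vector for one user's basket history."""
    n = len(baskets)
    basket_vecs = []
    for t, items in enumerate(baskets):
        vec = np.zeros(n_items, dtype="float32")
        vec[list(items)] = within_decay_rate ** (n - t - 1)  # older baskets decay
        basket_vecs.append(vec)
    # Simplified grouping: numpy.array_split stands in for _group_history_list.
    blocks = np.array_split(np.stack(basket_vecs), min(n_groups, n))
    out = np.zeros(n_items, dtype="float32")
    for g, block in enumerate(blocks):
        out += block.mean(axis=0) * group_decay_rate ** (len(blocks) - g - 1)
    return out / len(blocks)


# Basket histories (lists of item ids) for three "training" users and one target user.
train_histories = [[[0, 1], [1, 2]], [[2, 3], [3, 4]], [[0, 4], [4, 5]]]
train_vectors = np.stack([user_vector(h) for h in train_histories])
target_vector = user_vector([[1, 2], [2, 3]])

# KNN lookup via a kd-tree, then blend the user's own vector with the
# element-wise average of the neighbors' vectors (the role of alpha).
tree = KDTree(train_vectors)
_, idx = tree.query([target_vector], k=2)
neighbor_avg = train_vectors[idx.squeeze()].mean(axis=0)
scores = alpha * target_vector + (1 - alpha) * neighbor_avg
print(scores)  # higher score = item more likely in the next basket
```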