diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c285e93f..4f8e84c0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,7 +30,7 @@ repos: name: Security check (bandit) entry: bandit types: [python] - args: ["--recursive", "lib/"] + args: ["-x", "tests", --recursive, choice_learn] language: system - id: pytest-check name: Tests (pytest) diff --git a/choice_learn/__init__.py b/choice_learn/__init__.py new file mode 100644 index 00000000..0f7eda1f --- /dev/null +++ b/choice_learn/__init__.py @@ -0,0 +1 @@ +"""Choice-Learn library for Python.""" diff --git a/choice_learn/data/__init__.py b/choice_learn/data/__init__.py new file mode 100644 index 00000000..ee23245a --- /dev/null +++ b/choice_learn/data/__init__.py @@ -0,0 +1 @@ +"""Data handling classes and functions.""" diff --git a/lib/data/choice_dataset.py b/choice_learn/data/choice_dataset.py similarity index 91% rename from lib/data/choice_dataset.py rename to choice_learn/data/choice_dataset.py index a2fba478..14716252 100644 --- a/lib/data/choice_dataset.py +++ b/choice_learn/data/choice_dataset.py @@ -2,8 +2,9 @@ import numpy as np import pandas as pd -from choice_modeling.data.indexer import ChoiceDatasetIndexer -from choice_modeling.data.store import Store + +from choice_learn.data.indexer import ChoiceDatasetIndexer +from choice_learn.data.store import Store class ChoiceDataset(object): @@ -686,7 +687,39 @@ def save(self): def summary(self): """Method to display a summary of the dataset.""" - raise NotImplementedError + print("Summary of the dataset:") + print("Number of items:", self.get_num_items()) + print("Number of sessions:", self.get_num_sessions()) + print( + "Number of choices:", + self.get_num_choices(), + "Averaging", + self.get_num_choices() / self.get_num_sessions(), + "choices per session", + ) + if self.items_features is not None: + print(f"Items features: {self.items_features_names}") + if self.items_features is not None: + print(f"{sum([f.shape[1] 
for f in self.items_features])} items features") + else: + print("No items features registered") + + if self.sessions_features is not None: + print(f"Sessions features: {self.sessions_features_names}") + if self.sessions_features is not None: + print(f"{sum([f.shape[1] for f in self.sessions_features])} session features") + else: + print("No sessions features registered") + + if self.sessions_items_features is not None: + print(f"Session Items features: {self.sessions_items_features_names}") + if self.sessions_items_features is not None: + print( + f"{sum([f.shape[2] for f in self.sessions_items_features])} sessions \ + items features" + ) + else: + print("No sessions items features registered") def get_choice_batch(self, choice_index): """Method to access data within the ListChoiceDataset from its index. @@ -845,7 +878,7 @@ def __getitem__(self, session_indexes): sessions_items_features_names=self.sessions_items_features_names, ) - def batch(self, batch_size=None, shuffle=None, sample_weight=None): + def old_batch(self, batch_size=None, shuffle=None, sample_weight=None): """Iterates over dataset return batches of length self.batch_size. Parameters @@ -892,6 +925,50 @@ def batch(self, batch_size=None, shuffle=None, sample_weight=None): yielded_size += 2 * num_choices @property - def iloc(self): + def batch(self): """Indexer.""" return self.indexer + + def iter_batch(self, batch_size=None, shuffle=None, sample_weight=None): + """Iterates over dataset return batches of length self.batch_size. + + Newer version. 
+ + Parameters + ---------- + batch_size : int + batch size to set + shuffle: bool + Whether or not to shuffle the dataset + sample_weight : Iterable + list of weights to be returned with the right indexing during the shuffling + """ + if batch_size is None: + batch_size = self.batch_size + if shuffle is None: + shuffle = self.shuffle + if batch_size == -1: + batch_size = self.get_num_choices() + + # Get indexes for each choice + num_choices = self.get_num_choices() + indexes = np.arange(num_choices) + # Shuffle indexes + if shuffle and not batch_size == -1: + indexes = np.random.permutation(indexes) + + yielded_size = 0 + while yielded_size < num_choices: + # Return sample_weight if not None, for index matching + if sample_weight is not None: + yield ( + self.batch[indexes[yielded_size : yielded_size + batch_size].tolist()], + sample_weight[indexes[yielded_size : yielded_size + batch_size].tolist()], + ) + else: + yield self.batch[indexes[yielded_size : yielded_size + batch_size].tolist()] + yielded_size += batch_size + + # Special exit strategy for batch_size = -1 + if batch_size == -1: + yielded_size += 2 * num_choices diff --git a/lib/data/indexer.py b/choice_learn/data/indexer.py similarity index 99% rename from lib/data/indexer.py rename to choice_learn/data/indexer.py index 0bc2de80..f2887581 100644 --- a/lib/data/indexer.py +++ b/choice_learn/data/indexer.py @@ -60,7 +60,7 @@ def __getitem__(self, sequence_index): if isinstance(sequence_index, slice): return [ self.store.store[self.store.sequence[i]] - for i in range(*sequence_index.indices(len(self.sequence))) + for i in range(*sequence_index.indices(len(self.store.sequence))) ] return self.store.store[self.store.sequence[sequence_index]] diff --git a/lib/data/store.py b/choice_learn/data/store.py similarity index 98% rename from lib/data/store.py rename to choice_learn/data/store.py index 5680943f..c0a13b43 100644 --- a/lib/data/store.py +++ b/choice_learn/data/store.py @@ -1,6 +1,7 @@ """Different 
classes to optimize RAM usage with repeated features over time.""" import numpy as np -from choice_modeling.data.indexer import OneHotStoreIndexer, StoreIndexer + +from choice_learn.data.indexer import OneHotStoreIndexer, StoreIndexer class Store(object): @@ -22,7 +23,7 @@ def __init__(self, indexes=None, values=None, sequence=None, name=None, indexer= name of the features store -- not used at the moment """ if indexes is None: - indexes = list(range(values)) + indexes = list(range(len(values))) self.store = {k: v for (k, v) in zip(indexes, values)} self.sequence = np.array(sequence) self.name = name @@ -62,7 +63,7 @@ def __len__(self): return len(self.sequence) @property - def iloc(self): + def batch(self): """Indexing attribute.""" return self.indexer diff --git a/choice_learn/models/__init__.py b/choice_learn/models/__init__.py new file mode 100644 index 00000000..c5e79129 --- /dev/null +++ b/choice_learn/models/__init__.py @@ -0,0 +1 @@ +"""Models classes and functions.""" diff --git a/lib/models/base_model.py b/choice_learn/models/base_model.py similarity index 100% rename from lib/models/base_model.py rename to choice_learn/models/base_model.py diff --git a/lib/models/conditional_mnl.py b/choice_learn/models/conditional_mnl.py similarity index 100% rename from lib/models/conditional_mnl.py rename to choice_learn/models/conditional_mnl.py diff --git a/lib/models/rumnet.py b/choice_learn/models/rumnet.py similarity index 99% rename from lib/models/rumnet.py rename to choice_learn/models/rumnet.py index 2b51837b..4436b62c 100644 --- a/lib/models/rumnet.py +++ b/choice_learn/models/rumnet.py @@ -1,6 +1,7 @@ """Implementation of RUMnet for easy use.""" import tensorflow as tf -from choice_modeling.models.base_model import ChoiceModel + +from choice_learn.models.base_model import ChoiceModel class PaperRUMnet(ChoiceModel): diff --git a/lib/tf_ops.py b/choice_learn/tf_ops.py similarity index 100% rename from lib/tf_ops.py rename to choice_learn/tf_ops.py diff --git 
a/lib/.gitkeep b/lib/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/pyproject.toml b/pyproject.toml index 436216d0..cc399e36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ requires-python = ">=3.8" "Documentation" = "https://artefactory.github.io/choice-learn-private" [tool.setuptools] -packages = ["lib", "config", "tests"] +packages = ["choice_learn", "config", "tests"] [tool.ruff] select = [ @@ -62,4 +62,7 @@ convention = "google" quote-style = "double" [tool.ruff.isort] -known-first-party = ["lib", "config", "tests"] +known-first-party = ["choice_learn", "config", "tests"] + +[tool.bandit] +exclude_dirs = ["tests/"] diff --git a/tests/unit_tests/.gitkeep b/tests/unit_tests/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit_tests/data/test_store.py b/tests/unit_tests/data/test_store.py new file mode 100644 index 00000000..2f57e86e --- /dev/null +++ b/tests/unit_tests/data/test_store.py @@ -0,0 +1,23 @@ +"""Test the store module.""" +from choice_learn.data.store import Store + + +def test_len_store(): + """Test the __len__ method of Store.""" + store = Store(values=[1, 2, 3, 4], sequence=[0, 1, 2, 3, 0, 1, 2, 3]) + assert len(store) == 8 + + +def test_get_store_element(): + """Test the _get_store_element method of Store.""" + store = Store(values=[1, 2, 3, 4], sequence=[0, 1, 2, 3, 0, 1, 2, 3]) + assert store._get_store_element(0) == 1 + assert store._get_store_element([0, 1, 2]) == [1, 2, 3] + + +def test_store_batch(): + """Test the batch method of Store.""" + store = Store(values=[1, 2, 3, 4], sequence=[0, 1, 2, 3, 0, 1, 2, 3]) + assert store.batch[1] == 2 + assert store.batch[2:4] == [3, 4] + assert store.batch[[2, 3, 6, 7]] == [3, 4, 3, 4] diff --git a/tests/unit_tests/test_placeholder.py b/tests/unit_tests/test_placeholder.py deleted file mode 100644 index 338a8e05..00000000 --- a/tests/unit_tests/test_placeholder.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Placeholder test file for unit 
tests. To be replaced with actual tests.""" - - -def test_placeholder() -> None: - """To be replaced with actual tests.""" - pass