
Add HieAODE #27

Draft · wants to merge 41 commits into base: master

Commits (41)
e20cebd
wip: start implementing hieaode
Jasperhino Dec 21, 2023
a193da6
wip: calculate cpts
Jasperhino Dec 21, 2023
0ededd3
refactored HieAODE to calculate each of the conditional probabilities…
saragrau4 Jan 13, 2024
702e176
fixed calculate_class_prior
saragrau4 Jan 14, 2024
4e4d2c4
insertion of value in calculation of calculate_class_prior and also a…
saragrau4 Jan 14, 2024
6bf31f1
added black formatting
saragrau4 Jan 14, 2024
2bc47c8
corrected calculation of class prior
saragrau4 Jan 15, 2024
5f82d39
wip: add argmax for prediction logic
Jasperhino Jan 17, 2024
639ef42
fixed bugs in select_predict hieAODE
saragrau4 Jan 17, 2024
216a638
wip: replace pyitlib
Jasperhino Jan 22, 2024
8350ab9
refactor package structure
Jasperhino Jan 22, 2024
43fafbe
feat: lint and fix all tests
Jasperhino Jan 24, 2024
d3e9b77
fix test ci
Jasperhino Jan 24, 2024
53959ca
add missing lib file
Jasperhino Jan 24, 2024
c39e70c
lint
Jasperhino Jan 24, 2024
069676f
add matrix
Jasperhino Jan 24, 2024
d9265cf
remove pypi python versions
Jasperhino Jan 24, 2024
c7871c6
implemented tests for cpts
saragrau Jan 27, 2024
4cd5068
now descendants = all features - ancestors - current feature and not …
saragrau Jan 28, 2024
819beb5
Merge pull request #2 from hasso-plattner-institute/add-matrix-ci
Jasperhino Feb 7, 2024
1bfef18
Merge remote-tracking branch 'origin/master' into add-hie-aode
Jasperhino Feb 7, 2024
5cdf45a
lint
Jasperhino Feb 7, 2024
38ccb05
Calculating conditional probabilities with the laplace estimator
saragrau Feb 12, 2024
5caa487
removed old test form test_hie_aode.py
saragrau Mar 20, 2024
70e1912
refactored `select_and_predict` method for HieAODE class
saragrau Mar 20, 2024
1b77ad2
Refactored HieAODE into HieAODEBase
saragrau Mar 20, 2024
f898c6b
added HieAODELite
saragrau Mar 20, 2024
74d621b
added HieAODEplusplus
saragrau Mar 29, 2024
bb6b385
renamed module `selectors` to `hierarchical_selectors`
saragrau Apr 19, 2024
26341a5
implementation of using positive or negative values only for product
saragrau Apr 19, 2024
dac88a5
renamed descendats and ancestors cpts to prob_feature_given_class_and…
saragrau Apr 20, 2024
30111f4
refactored calculate_prob_given_ascendant_class to calculate_prob_fea…
saragrau Apr 20, 2024
17cc376
Modified implementation HieAODE_plus_plus
saragrau Apr 20, 2024
408f821
renamed feature_idx to parent_idx
saragrau Apr 20, 2024
627ab7c
Refactor select_and_predict to enforce subclass-specific logic
saragrau Apr 20, 2024
c25e3d7
black formatting
saragrau Apr 20, 2024
4a030a5
Implemented HieAODELitePlusPlus, HieAODELitePlus, HieAODEPlusPlus and…
saragrau Apr 20, 2024
c2bcef6
added comments and documentation
saragrau Apr 22, 2024
128dda4
linting
saragrau Apr 22, 2024
ad38daa
Added detailed documentation for each method
saragrau Apr 30, 2024
8d11f4b
resolved indentation formatting issues
Jan 12, 2025
79 changes: 79 additions & 0 deletions .github/workflows/lint-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Run this job on pushes to `main`, and for pull requests. If you don't specify
# `branches: [main]`, then this action runs _twice_ on pull requests, which is
# annoying.

on:
push:
branches: [main]
pull_request:

jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: psf/black@stable
test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]

steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

# Cache the installation of Poetry itself, e.g. the next step. This prevents the workflow
# from installing Poetry every time, which can be slow. Note the use of the Poetry version
# number in the cache key, and the "-0" suffix: this allows you to invalidate the cache
# manually if/when you want to upgrade Poetry, or if something goes wrong. This could be
# mildly cleaner by using an environment variable, but I don't really care.
- name: cache poetry install
uses: actions/cache@v2
with:
path: ~/.local
key: poetry-1.7.1

# Install Poetry. You could do this manually, or there are several actions that do this.
# `snok/install-poetry` seems to be minimal yet complete, and really just calls out to
# Poetry's default install script, which feels correct. I pin the Poetry version here
# because Poetry does occasionally change APIs between versions and I don't want my
# actions to break if it does.
#
# The key configuration value here is `virtualenvs-in-project: true`: this creates the
# venv as a `.venv` in your testing directory, which allows the next step to easily
# cache it.
- uses: snok/install-poetry@v1
with:
version: 1.7.1
virtualenvs-create: true
virtualenvs-in-project: true

# Cache your dependencies (i.e. all the stuff in your `pyproject.toml`). Note the cache
# key: if you're using multiple Python versions, or multiple OSes, you'd need to include
# them in the cache key. I'm not, so it can be simple and just depend on the poetry.lock.
- name: cache deps
id: cache-deps
uses: actions/cache@v2
with:
path: .venv
key: pydeps-${{ hashFiles('**/poetry.lock') }}

# Install dependencies. `--no-root` means "install all dependencies but not the project
# itself", which is what you want to avoid caching _your_ code. The `if` statement
# ensures this only runs on a cache miss.
- run: poetry install --no-interaction --no-root
if: steps.cache-deps.outputs.cache-hit != 'true'

# Now install _your_ project. This isn't necessary for many types of projects -- particularly
# things like Django apps don't need this. But it's a good idea since it fully exercises the
# pyproject.toml and ensures that if you add things like console-scripts at some point,
# they'll be installed and working.
- run: poetry install --no-interaction

# And finally run tests. I'm using pytest and all my pytest config is in my `pyproject.toml`
# so this line is super-simple. But it could be as complex as you need.
- run: poetry run pytest hfs
38 changes: 0 additions & 38 deletions .github/workflows/python-app.yml

This file was deleted.

6 changes: 4 additions & 2 deletions .gitignore
@@ -23,7 +23,6 @@ dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
@@ -79,4 +78,7 @@ results/
slurm-*.out

# WandB
wandb
wandb

# Weird file kgextension generates
rate_limits.db
63 changes: 63 additions & 0 deletions README.md
@@ -0,0 +1,63 @@
====================================================
hfs - A library for hierarchical feature selection
====================================================

Introduction
=============

Welcome to the **hfs** repository!👋
This library provides several hierarchical feature selection algorithms.

Many real-world settings contain hierarchical relations. While in text mining words can be ordered in generalization-specialization relationships, in bioinformatics the function of genes is often described as a hierarchy. We can make use of these relationships between a dataset's features with special hierarchical feature selection algorithms that reduce redundancy in the data. This not only makes tasks like classification faster but can also improve the results. Depending on your use case and preference, you can choose from lazy and eager hierarchical feature selection algorithms in this library.
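Such a hierarchy is a directed acyclic graph over the features. A minimal sketch of what one looks like, using a hypothetical four-feature ``networkx`` graph (the node numbering is illustrative, not from this library):

```python
import networkx as nx

# Hypothetical feature hierarchy: node 0 is the root term, and edges
# point from more general to more specific features.
hierarchy = nx.DiGraph([(0, 1), (0, 2), (1, 3), (1, 4)])

# For any feature, the hierarchy gives its more general terms
# (ancestors) and its more specific terms (descendants) -- the
# redundancy relations that hierarchical selectors exploit.
feature = 3
print(sorted(nx.ancestors(hierarchy, feature)))    # more general terms
print(sorted(nx.descendants(hierarchy, feature)))  # more specific terms
```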

Getting Started
===================================================

1. Installation
-------------------------------------

The package cannot be installed with pip or conda yet, so you need to clone the ``hfs`` repository::

    git clone https://github.com/hasso-plattner-institute/hfs.git

Then install the environment using::

    poetry install

2. Usage
-------------------------------------------
Here is a simple example of how to use one of the hierarchical feature selection algorithms implemented in hfs:

.. code-block:: python

from hfs import SHSELSelector

# Initialize selector
selector = SHSELSelector(hierarchy)

# Fit selector and transform data
selector.fit(X, y, columns=columns)
X_transformed = selector.transform(X)

Documentation
=============

For detailed information on how to use **hfs**, check out our complete documentation at https://hfs.readthedocs.io. 📖

There you can find not only the API documentation but also more examples, background information on the algorithms we implemented and results for some experiments we performed with them.

Contributing
============

We welcome contributions! If you would like to contribute to the project,
feel free to create a pull request.

Linting and Testing
-------------------

Format the code with::

    poetry run black .

Run the tests with::

    poetry run pytest hfs

Happy feature selecting!
5 changes: 0 additions & 5 deletions environment.yml

This file was deleted.

4 changes: 2 additions & 2 deletions examples/eager_learning_example.py
@@ -18,8 +18,8 @@
import networkx as nx
import numpy as np

from hfs import SHSELSelector
from hfs.helpers import get_columns_for_numpy_hierarchy
from hfs.hierarchical_selectors import SHSELSelector

# Example dataset X with 3 samples and 5 features.
X = np.array(
@@ -65,7 +65,7 @@
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

from hfs.data.data_utils import create_mapping_columns_to_nodes, load_data, process_data
from hfs.data_utils import create_mapping_columns_to_nodes, load_data, process_data
from hfs.preprocessing import HierarchicalPreprocessor
from hfs.shsel import SHSELSelector

33 changes: 25 additions & 8 deletions examples/lazy_learning_example.py
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
# %%
"""
Lazy learning
=====================
@@ -10,11 +11,8 @@
import networkx as nx
import numpy as np

from hfs.hip import HIP
from hfs.hnb import HNB
from hfs.mr import MR
from hfs.preprocessing import HierarchicalPreprocessor
from hfs.tan import Tan
from hfs.hierarchical_selectors import HIP, HNB, MR, RNB, TAN, HieAODEBase, HNBs


# Define data
@@ -39,6 +37,25 @@ def preprocess():


train, test, train_y_data, test_y_data, hierarchy = preprocess()
# %%
"""
=========================================================================
HieAODE
=========================================================================
"""

print("\nHieAODE:")
# Initialize and fit HieAODE model
model = HieAODEBase(hierarchy=hierarchy)
model.fit_selector(X_train=train, y_train=train_y_data, X_test=test)
# %%
# Select features and predict
predictions = model.select_and_predict(predict=True, saveFeatures=True)
print(predictions)
# %%
# Calculate score
score = model.get_score(test_y_data, predictions)
print(score)

"""
=========================================================================
@@ -59,7 +76,7 @@ def preprocess():
score = model.get_score(test_y_data, predictions)
print(score)


# %%
"""
=========================================================================
HNB-s
@@ -68,7 +85,7 @@ def preprocess():

print("HNB-s:")
# Initialize and fit HNBs model
model = HNB(hierarchy=hierarchy)
model = HNBs(hierarchy=hierarchy)
model.fit_selector(X_train=train, y_train=train_y_data, X_test=test)

# Select features and predict
@@ -88,7 +105,7 @@ def preprocess():

print("\nRNB:")
# Initialize and fit RNB model with threshold k = 3 features to select
model = HNB(hierarchy=hierarchy)
model = RNB(hierarchy=hierarchy)
model.fit_selector(X_train=train, y_train=train_y_data, X_test=test)

# Select features and predict
@@ -144,7 +161,7 @@ def preprocess():
"""
print("\nTAN:")
# Initialize and fit Tan model
model = Tan(hierarchy=hierarchy)
model = TAN(hierarchy=hierarchy)
model.fit_selector(X_train=train, y_train=train_y_data, X_test=test)

# Select features and predict
Binary file added examples/rate_limits.db
Binary file not shown.
25 changes: 10 additions & 15 deletions experiments/experiments.py
@@ -3,17 +3,12 @@

import networkx as nx
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
from hfs.data.data_utils import create_mapping_columns_to_nodes
from sklearn.naive_bayes import BernoulliNB

from hfs.hip import HIP
from hfs.hnb import HNB
from hfs.hnbs import HNBs
from hfs.mr import MR
from hfs.data_utils import create_mapping_columns_to_nodes
from hfs.preprocessing import HierarchicalPreprocessor
from hfs.rnb import RNB
from hfs.tan import Tan
from hfs.hierarchical_selectors import HIP, HNB, MR, RNB, TAN, HNBs


def data():
@@ -81,12 +76,12 @@ def mr(hierarchy, train, y_train, test, y_test, k, columns, path):


def tan(hierarchy, train, y_train, test, y_test, k, columns, path):
model = Tan(hierarchy=hierarchy)
model = TAN(hierarchy=hierarchy)
model.fit_selector(X_train=train, y_train=y_train, X_test=test, columns=columns)
pred = model.select_and_predict(predict=True, saveFeatures=True)
score = model.get_score(y_test, pred)
with open(path, "a") as file:
file.write("\nTan:\n")
file.write("\nTAN:\n")
file.write(json.dumps(score))


@@ -99,11 +94,11 @@ def hip(hierarchy, train, y_train, test, y_test, k, columns, path):
file.write("\nHIP:\n")
file.write(json.dumps(score))

def naive_bayes(hierarchy, train, y_train, test, y_test, k, columns,path):

def naive_bayes(hierarchy, train, y_train, test, y_test, k, columns, path):
clf = BernoulliNB()
clf.fit(train, y_train)
predictions = clf.predict(test)
predictions = clf.predict(test)
score = classification_report(y_true=y_test, y_pred=predictions, output_dict=True)
with open(path, "a") as file:
file.write("\nBaseline:\n")
@@ -117,7 +112,7 @@ def evaluate(data, k):
preprocessor.fit(train, columns=columns)
train = preprocessor.transform(train)
test = preprocessor.transform(test)

hierarchy = preprocessor.get_hierarchy()
graph = nx.DiGraph(hierarchy)
columns = create_mapping_columns_to_nodes(pd.DataFrame(train), graph)
@@ -134,7 +129,7 @@ def evaluate(data, k):
y_test=y_test,
k=k,
columns=columns,
path = path
path=path,
)

