Full version #14

Merged
merged 24 commits into from
Oct 26, 2023

Changes from all commits
35 changes: 23 additions & 12 deletions .github/workflows/code_check.yml
@@ -13,24 +13,37 @@ jobs:
- name: Set up Python 3.10
uses: actions/setup-python@v4
with:
python-version: '3.10.9'
check-latest: true

- name: Install dependencies
run: |
python -m pip install --upgrade pip
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

- name: Check black source code
run: black liltab --line-length=100

- name: Check black test code
run: black test --line-length=100

- name: Check black bin code
run: black bin --line-length=100

- name: Check flake8 source code
run: flake8 liltab --max-line-length=100

- name: Check flake8 test code
run: flake8 test --max-line-length=100

- name: Check flake8 bin code
run: flake8 bin --max-line-length=100

- name: Run test
run: |
export PYTHONPATH=`pwd`
pytest -vv --cov=liltab --junitxml=pytest.xml --cov-report=term-missing | tee pytest-coverage.txt

- name: Check test coverage
run: |
coverage_threshold=80
@@ -41,7 +54,6 @@ jobs:
fi

- name: Publish test report as an artifact
if: always()
uses: actions/upload-artifact@v3
with:
name: pytest-results
@@ -50,10 +62,9 @@
pytest.xml

- name: Pytest coverage comment
if: always()
uses: MishaKav/pytest-coverage-comment@main
with:
pytest-coverage-path: pytest-coverage.txt
title: coverage
junitxml-path: pytest.xml
github-token: ${{github.token}}
6 changes: 5 additions & 1 deletion .gitignore
@@ -160,4 +160,8 @@ cython_debug/
#.idea/

.vscode/
.venv/

.coverage
lightning_logs/*
*.ipynb
19 changes: 19 additions & 0 deletions LICENSE
@@ -0,0 +1,19 @@
Copyright (c) 2018 The Python Packaging Authority

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
123 changes: 117 additions & 6 deletions README.md
@@ -1,17 +1,128 @@
# liltab
liltab is a meta-learning package written in Python, based on [1]. We implemented the inference network using PyTorch. In addition to the model, we provide a complete data loading and training API, which together enable end-to-end model creation. We also integrated the package with TensorBoard to monitor the training process in real time.

## Installation
To install the package, simply execute the following command:
``` bash
pip install liltab
```

## Why use liltab?
When you have few observations in your dataset and want an out-of-the-box model that is a good starting point for further research, liltab is a perfect fit. We provide a model which can be pretrained on any tabular data and then applied to specific tasks with no further training required.

## How to use liltab?
Assume that you have a number of `.csv` files with data of varying dimensionality. First, you need to split them into three directories: `train`, `val` and `test`. Next, you need to create data loaders from these directories. The fastest approach is to use `ComposedDataLoaderFactory`, which takes the following parameters:
* `path` - path to the data stored in `.csv` format. The directory should contain only `.csv` files.
* `dataset_cls` - class which wraps the `.csv` data in a torch dataset. Either `PandasDataset` or `RandomFeaturesPandasDataset`.
* `dataset_creation_args` - arguments passed to the dataset constructor. Defaults to None.
* `loader_cls` - class which will be used to load data from the datasets. Currently only `FewShotDataLoader`.
* `dataloader_creation_args` - arguments passed to the dataloader constructor. See the `FewShotDataLoader` docstrings.
* `composed_dataloader_cls` - class encapsulating all created dataloaders. Either `ComposedDataLoader` or `RepeatableOutputComposedDataLoader`.
* `batch_size` - size of the batches returned by the created dataloader.

Let's explain what the particular classes do:
* `PandasDataset` - a simple dataset wrapping `.csv` data. An indexable class which on each access returns `X` and `y` tensors according to the selected attribute and response columns.
* `RandomFeaturesPandasDataset` - same as above, but on each access returns a random subset of the attributes and responses from the data.
* `FewShotDataLoader` - an iterable class which loads data in a few-shot learning manner, i.e. each iteration returns `X_support`, `y_support`, `X_query`, `y_query` (see the iteration sketch after the factory example below).
* `ComposedDataLoader` - an iterable class composing multiple instances of `FewShotDataLoader`. Each iteration returns observations from randomly selected loaders.
* `RepeatableOutputComposedDataLoader` - same as above, but each pass returns the same sequence of examples. Useful during model validation.

Example of data loader creation:
``` Python
ComposedDataLoaderFactory.create_composed_dataloader_from_path(
path=Path("train"),
dataset_cls=RandomFeaturesPandasDataset,
dataset_creation_args={},
loader_cls=FewShotDataLoader,
dataloader_creation_args={"support_size": 3, "query_size": 29},
composed_dataloader_cls=ComposedDataLoader,
batch_size=32,
)
```
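The factory returns an iterable loader. Below is a minimal sketch of consuming it; the unpacking into `(X_support, y_support, X_query, y_query)` tuples is an assumption based on the `FewShotDataLoader` description above, so the exact batch structure may differ.
``` Python
from pathlib import Path

from liltab.data.datasets import RandomFeaturesPandasDataset
from liltab.data.dataloaders import FewShotDataLoader, ComposedDataLoader
from liltab.data.factory import ComposedDataLoaderFactory

train_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path(
    path=Path("train"),
    dataset_cls=RandomFeaturesPandasDataset,
    dataset_creation_args={},
    loader_cls=FewShotDataLoader,
    dataloader_creation_args={"support_size": 3, "query_size": 29},
    composed_dataloader_cls=ComposedDataLoader,
    batch_size=32,
)

# Assumed structure: each item is a batch of few-shot episodes,
# every episode being an (X_support, y_support, X_query, y_query) tuple.
first_batch = next(iter(train_loader))
for X_support, y_support, X_query, y_query in first_batch:
    print(X_support.shape, y_support.shape, X_query.shape, y_query.shape)
```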
Having the data loaders created, we can build the model, `HeterogenousAttributesNetwork`, with the following parameters:
* `hidden_representation_size` - size of the hidden representations, i.e. all intermediate network outputs.
* `n_hidden_layers` - number of hidden layers of the networks used during inference.
* `hidden_size` - number of neurons per hidden layer in the networks used during inference.
* `dropout_rate` - dropout rate of the networks used during inference.
* `inner_activation_function` - inner activation function of the networks used during inference.
* `output_activation_function` - output activation function of the final network used during inference.
* `is_classifier` - if `True`, the network outputs class probabilities for the query set.

Example of model creation:
``` Python
HeterogenousAttributesNetwork(
hidden_representation_size=16,
n_hidden_layers=1,
hidden_size=16,
dropout_rate=0.2,
inner_activation_function=nn.ReLU(),
output_activation_function=nn.Identity(),
is_classifier=False
)
```

Next, we create the object responsible for training, `HeterogenousAttributesNetworkTrainer`, with the following parameters:
* `n_epochs` - number of epochs to train.
* `gradient_clipping` - if `True`, gradient clipping is applied.
* `learning_rate` - learning rate used during training.
* `weight_decay` - weight decay used during training.
* `early_stopping` - if `True`, early stopping is applied.
* `file_logger` - if `True`, logging to a `.csv` file is used.
* `tb_logger` - if `True`, logging to TensorBoard is used.
* `model_checkpoints` - if `True`, model checkpoints are saved.
* `results_path` - path to the results directory.

Example of trainer creation:
``` Python
HeterogenousAttributesNetworkTrainer(
n_epochs=100_000,
gradient_clipping=False,
learning_rate=1e-3,
weight_decay=1e-4,
early_stopping=True,
file_logger=True,
tb_logger=True,
model_checkpoints=True,
results_path=Path("sample_results"),
)
```

Finally, to train the model, call the `train_and_test` method, e.g.:
``` Python
trainer.train_and_test(
model=model,
train_loader=train_loader,
val_loader=val_loader,
test_loader=test_loader,
)
```
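After training, the network can be applied to a new small task without any further fitting. The following is only a rough sketch: the forward call `model(X_support, y_support, X_query)` and the tensor shapes are assumptions inferred from the few-shot batch layout described above, not a documented signature, so consult the `HeterogenousAttributesNetwork` docstrings before relying on it.
``` Python
import torch

# Hypothetical task with 4 attributes: 3 support and 5 query observations
# (shapes chosen purely for illustration).
X_support = torch.randn(3, 4)
y_support = torch.randn(3, 1)
X_query = torch.randn(5, 4)

model.eval()
with torch.no_grad():
    # Assumed forward signature: predict query responses from the support set.
    y_query_pred = model(X_support, y_support, X_query)

print(y_query_pred.shape)  # expected to be (5, 1) under these assumptions
```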

**For complete usage examples, see the `experiments` directory.**

## Dev
You need to have Python 3.10 and pip.

To install the dependencies, run:
``` bash
pip install -r requirements.txt
```

## Running tests
You need to have all requirements installed.
To format the code and check it with the linter, run:
``` bash
make prepare_code
```

Run the tests with:
``` bash
make run_tests
```

## Authors
The package was created as a result of a thesis by
* **Antoni Zajko**,
* **Dawid Płudowski**.

Project coordinator and supervisor: **Anna Kozak**

## References
[1] Iwata, T. and Kumagai, A. (2020). Meta-learning from Tasks with
Heterogeneous Attribute Spaces. In Advances in Neural Information Processing Systems,
volume 33, pages 6053–6063. Curran Associates, Inc.
128 changes: 128 additions & 0 deletions bin/train.py
@@ -0,0 +1,128 @@
import typer
import yaml
import pytorch_lightning as pl
import warnings

from liltab.data.datasets import PandasDataset, RandomFeaturesPandasDataset
from liltab.data.dataloaders import (
FewShotDataLoader,
ComposedDataLoader,
RepeatableOutputComposedDataLoader,
)
from liltab.data.factory import ComposedDataLoaderFactory
from liltab.model.heterogenous_attributes_network import HeterogenousAttributesNetwork
from liltab.train.trainer import HeterogenousAttributesNetworkTrainer
from liltab.train.logger import TensorBoardLogger, FileLogger
from loguru import logger
from typing_extensions import Annotated
from pathlib import Path

warnings.filterwarnings("ignore")
app = typer.Typer()


@app.command(help="Trains network on heterogenous attribute spaces.")
def main(
config_path: Annotated[Path, typer.Option(..., help="Path to experiment configuration.")],
logger_type: Annotated[
str,
typer.Option(
...,
help="""typer of logger. tb=[tensorboard],
flat=[flat file], both=[tensoboard and flat file]""",
),
] = "both",
use_profiler: Annotated[
str,
typer.Option(
...,
help="""""use profiler (take long time, 8-10 epoches suggested),
yes or no; requires tensorboard (logger-type=[tb|both])""",
),
] = "no",
seed: Annotated[int, typer.Option(..., help="Seed")] = 123,
):
pl.seed_everything(seed)

logger.info("Loading config")
with open(config_path) as f:
config = yaml.load(f, Loader=yaml.CLoader)

logger.info("Loading data")
train_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path(
Path(config["train_data_path"]),
RandomFeaturesPandasDataset,
{},
FewShotDataLoader,
{"support_size": config["support_size"], "query_size": config["query_size"]},
ComposedDataLoader,
batch_size=config["batch_size"],
)
val_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path(
Path(config["val_data_path"]),
PandasDataset,
{},
FewShotDataLoader,
{"support_size": config["support_size"], "query_size": config["query_size"]},
RepeatableOutputComposedDataLoader,
batch_size=config["batch_size"],
)
test_loader = ComposedDataLoaderFactory.create_composed_dataloader_from_path(
Path(config["test_data_path"]),
PandasDataset,
{},
FewShotDataLoader,
{"support_size": config["support_size"], "query_size": config["query_size"]},
RepeatableOutputComposedDataLoader,
batch_size=config["batch_size"],
)

logger.info("Creating model")
model = HeterogenousAttributesNetwork(
hidden_representation_size=config["hidden_representation_size"],
n_hidden_layers=config["n_hidden_layers"],
hidden_size=config["hidden_size"],
dropout_rate=config["dropout_rate"],
)

if logger_type == "tb":
tb_logger = TensorBoardLogger(
"results/tensorboard",
name=config["name"],
use_profiler=True if use_profiler == "yes" else False,
)
file_logger = None
elif logger_type == "flat":
tb_logger = None
file_logger = FileLogger("results/flat")
elif logger_type == "both":
tb_logger = TensorBoardLogger(
"results/tensorboard",
name=config["name"],
use_profiler=True if use_profiler == "yes" else False,
)
file_logger = FileLogger("results/flat")
else:
raise ValueError("logger_type must be one of [tb, flat, both]")

trainer = HeterogenousAttributesNetworkTrainer(
n_epochs=config["num_epochs"],
gradient_clipping=config["gradient_clipping"],
learning_rate=config["learning_rate"],
weight_decay=config["weight_decay"],
early_stopping=config["early_stopping"],
file_logger=file_logger,
tb_logger=tb_logger,
)

logger.info("Training model")
trainer.train_and_test(
model=model,
train_loader=train_loader,
val_loader=val_loader,
test_loader=test_loader,
)


if __name__ == "__main__":
app()
20 changes: 20 additions & 0 deletions config/01_synthetic_data_experiment_config.yaml
@@ -0,0 +1,20 @@
name: "synthetic"
num_epochs: 100000
learning_rate: 0.001
weight_decay: 0
batch_size: 256
gradient_clipping: False
early_stopping: True

support_size: 5
query_size: 27

hidden_representation_size: 32
n_hidden_layers: 3
hidden_size: 32
dropout_rate: 0.1
is_classifier: False

train_data_path: data/01_synthetic/train
val_data_path: data/01_synthetic/val
test_data_path: data/01_synthetic/test
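
For reference, a config like the one above would be passed to `bin/train.py` roughly as follows. The option names are inferred from the typer signature in `bin/train.py` (typer turns underscores into dashes by default) and are not taken from project documentation, so treat this as a sketch:
``` bash
# Hedged example: train on the synthetic-data config, log to both
# TensorBoard and flat files, and skip the profiler.
python bin/train.py \
  --config-path config/01_synthetic_data_experiment_config.yaml \
  --logger-type both \
  --use-profiler no \
  --seed 123
```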