Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ci updates #7

Merged
merged 23 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
11c579e
remove test env file to prepare for dynamic test env generation
philippguevorguian Feb 21, 2024
ae19952
remove breaking flash attention and conda-pack deps
philippguevorguian Feb 21, 2024
aa64b67
Merge branch 'main' into CI_updates
philippguevorguian Feb 21, 2024
5bc7dbb
print test information
philippguevorguian Feb 21, 2024
634a94c
add linting for github actions to pre commit
philippguevorguian Feb 23, 2024
6badeb7
run unit tests in CI via github actions
philippguevorguian Feb 23, 2024
a3dc04b
add error handling to json line loading
philippguevorguian Feb 23, 2024
6933f21
expand example unit test coverage
philippguevorguian Feb 23, 2024
8a36b75
set output directory to correspond to experiment hash
philippguevorguian Feb 23, 2024
b8494ba
remove redundant requirements file
philippguevorguian Feb 26, 2024
5f823b4
add test status
philippguevorguian Feb 26, 2024
3dbdc39
Merge branch 'main' into CI_updates
philippguevorguian Feb 26, 2024
c519710
add test status
philippguevorguian Feb 26, 2024
8677627
reintroduce separate test environment file
philippguevorguian Feb 26, 2024
cc3f3f8
add test status
philippguevorguian Feb 26, 2024
0720d06
specify project name for pip install
philippguevorguian Feb 26, 2024
a64289f
add test status
philippguevorguian Feb 26, 2024
ebe98ea
update test environment name
philippguevorguian Feb 26, 2024
a03d3a5
add test status
philippguevorguian Feb 26, 2024
d079a83
explicitly activate conda env before steps which require it
philippguevorguian Feb 26, 2024
6d7e416
add test status
philippguevorguian Feb 26, 2024
86eb89a
add default shell
philippguevorguian Feb 26, 2024
d18fb6c
add test status
philippguevorguian Feb 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ jobs:
test:
name: Test
runs-on: ubuntu-latest
defaults:
run:
shell: bash -el {0}

steps:
- uses: actions/checkout@v4
Expand All @@ -21,12 +24,33 @@ jobs:
with:
python-version: '3.10'

# - name: Remove flash-attn dependency
# run: |
# sed -i '/flash-attn/d' environment.yml

# - name: Remove conda-pack dependency
# run: |
# sed -i '/conda-pack/d' environment.yml

# - name: Remove chemlactica dependency
# run: |
# sed -i '/chemlactica/d' environment.yml

- name: Set up Conda
uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
environment-file: test_environment.yml
activate-environment: testenv
auto-update-conda: true

- name: Install local chemlactica package within Conda environment
run: |
pip install . # Install dependencies within the Conda environment

- name: Run unit tests
run: |
python3 confirm_tests.py --run unit

- name: list commits on PR
run: |
response=$(curl --request GET \
Expand Down
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,8 @@ repos:
rev: 6.0.0
hooks:
- id: flake8

- repo: https://github.com/rhysd/actionlint
rev: v1.6.26
hooks:
- id: actionlint
1 change: 1 addition & 0 deletions chemlactica/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def train(

broadcast_object_list(experiment_hash_list)
print(f"Process {accelerator.process_index} aim hash: {experiment_hash_list[0]}")
experiment_hash = experiment_hash_list[0]

if not valid_batch_size:
valid_batch_size = train_batch_size
Expand Down
14 changes: 8 additions & 6 deletions chemlactica/utils/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@


def load_jsonl_line(jsonl_line):
    """Parse a single JSONL line into a dictionary.

    Handles both plain JSON objects and double-encoded records (a JSON
    string whose contents are themselves a serialized JSON object).

    Args:
        jsonl_line: One line of JSONL text.

    Returns:
        The decoded dictionary.

    Raises:
        ValueError: If the line is not valid JSON (at either encoding level).
    """
    try:
        _maybe_compound_dict = json.loads(jsonl_line)
        if isinstance(_maybe_compound_dict, dict):
            return _maybe_compound_dict
        # Some writers double-encode records: the outer value decodes to a
        # JSON string containing the real object, so decode once more.
        return json.loads(_maybe_compound_dict)
    except json.JSONDecodeError as e:
        # Chain the original decode error so the root cause stays visible.
        raise ValueError(f"Error decoding JSON: {e}") from e


def generate_assay_docs(examples, train_config):
Expand Down
24 changes: 16 additions & 8 deletions confirm_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,19 @@ class TestType(enum.Enum):
INTEGRATION = "integration"


def print_test_details(unit_test_result):
    """Print a summary of a unittest result object.

    Emits run/failure/error/skip counts and overall success, then — when
    anything failed or errored — a per-test detail section.
    """
    result = unit_test_result
    summary_lines = (
        f"\nTotal Tests: {result.testsRun}",
        f"Failures: {len(result.failures)}",
        f"Errors: {len(result.errors)}",
        f"Skipped: {len(result.skipped)}",
        f"Successful: {result.wasSuccessful()}",
    )
    for line in summary_lines:
        print(line)

    problems = result.failures + result.errors
    if problems:
        print("\nDetails about failures and errors:")
        for test_case, details in problems:
            print(f"\nTest: {test_case}")
            print(f"Details: {details}")


def write_test_status(
git_commit_hash: str, status: str = "FAIL", file_name: str = "test_status"
):
Expand All @@ -31,16 +44,11 @@ def run_unit_tests():
loader = unittest.TestLoader()
# Discover and load unit tests
unit_test_suite = loader.discover("unit_tests", pattern="*test*")
for test in unit_test_suite:
print(test)

# Run the unit tests
runner = unittest.TextTestRunner()
runner = unittest.TextTestRunner(failfast=False, verbosity=2)
result = runner.run(unit_test_suite)
if result.wasSuccessful():
print("All tests passed!")
else:
print("Some tests failed.")
print_test_details(result)


if __name__ == "__main__":
Expand Down Expand Up @@ -87,7 +95,7 @@ def run_unit_tests():
confirm = args.confirm
gpus = args.gpus
if run is not None:
match run:
match (run):
case TestType.UNIT:
run_unit_tests()
case TestType.INTEGRATION:
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@ description = "Language modelling for chemistry by YerevanN"
readme = "README.md"
requires-python = ">=3.8"
license = {text = "MIT"}

[tool.setuptools]
packages = ["chemlactica"]
112 changes: 0 additions & 112 deletions requirements.txt

This file was deleted.

2 changes: 1 addition & 1 deletion test_environment.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: test_cl11.8_t_4.37
name: testenv
channels:
- pytorch
- nvidia
Expand Down
2 changes: 1 addition & 1 deletion test_status.yaml
Original file line number Diff line number Diff line change
@@ -1 +1 @@
fe40dade26e27de4bd050161752291203ce9d39a: PASS
86eb89a6651607c56835456eb9d2f0ae6cd222cc: PASS
50 changes: 49 additions & 1 deletion unit_tests/test_something.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,55 @@
import unittest
import torch
from chemlactica.utils.dataset_utils import load_jsonl_line
from chemlactica.utils.dataset_utils import group_texts
from unittest.mock import Mock


class TestDataProcessing(unittest.TestCase):
    """Trivial smoke test confirming the unittest runner is wired up."""

    def test_positive(self):
        # Diff residue in the scrape stacked the old `test_something` def on
        # the new one; this is the merged (post-PR) state of the class.
        result = 2 + 2
        self.assertEqual(result, 4, "Expected result: 4")


class TestLoadJsonlLine(unittest.TestCase):
    """Unit tests for chemlactica.utils.dataset_utils.load_jsonl_line."""

    # A valid JSONL line should decode to the matching dictionary.
    def test_load_valid_jsonl_line_as_dict(self):
        jsonl_line = """{"key": "value"}"""
        # Fixed: the original asserted the function's result against a second
        # call of itself, which could never fail. Pin the expected dict.
        self.assertEqual(load_jsonl_line(jsonl_line), {"key": "value"})

    # An empty string is not valid JSON, so ValueError is raised (the old
    # comment claiming "Returns None" was wrong — the body expects a raise).
    def test_raise_value_error_empty_line(self):
        jsonl_line = ""
        with self.assertRaises(ValueError):
            load_jsonl_line(jsonl_line)


class TestGroupTexts(unittest.TestCase):
    """Tests for chemlactica.utils.dataset_utils.group_texts."""

    def test_empty_attention_mask(self):
        # Build a stand-in tokenizer object.
        # NOTE(review): this Mock is never injected into group_texts (no
        # patch of the tokenizer lookup), so the real tokenizer path is
        # presumably still used — confirm whether patching was intended.
        mocker = Mock()
        mocker.eos_token_id = 0
        mocker.return_value = mocker

        # Create example input tensors: input_ids present, attention_mask empty.
        examples = {"input_ids": [torch.tensor([1, 2, 3])], "attention_mask": []}

        # Set train_config
        train_config = {"tokenizer_path": "path/to/tokenizer", "block_size": 3}

        # Call the group_texts function; the empty attention_mask is expected
        # to make it raise (exact exception type not pinned down here).
        with self.assertRaises(Exception):
            group_texts(examples, train_config)

    # Commented-out draft test — note it references an undefined `result`
    # and never calls group_texts, so it would fail if re-enabled as-is.
    # def test_splits_into_correct_size_chunks(self):
    #     mocker = Mock()
    #     mocker.eos_token_id = 0
    #     mocker.return_value = mocker
    #     train_config = {"block_size": 2037}
    #     examples = {"input_ids": [torch.tensor([1, 2, 3])], "attention_mask": []}

    #     self.assertTrue(all(len(ids) ==
    #     train_config["block_size"] for ids in result["input_ids"]))
    #     self.assertTrue(all(len(mask) ==
    #     train_config["block_size"] for mask in result["attention_mask"]))
Loading