cancervariants · jsstevenson · Jan 19, 2024 · Jan 19, 2024
diff --git a/.flake8 b/.flake8
diff --git a/.github/workflows/checks.yaml b/.github/workflows/checks.yaml
@@ -19,3 +19,17 @@ jobs:
         run: |
           python -m pip install pipenv
           pipenv install --skip-lock  # this is what Elastic beanstalk uses
+  lint:
+    name: lint
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+
+      - name: Install dependencies
+        run: python3 -m pip install '.[dev]'
+
+      - name: Check style
+        run: python3 -m ruff check . && ruff format --check .
diff --git a/.gitignore b/.gitignore
@@ -130,12 +130,11 @@ analysis/graph/*.ipynb
 
 # Build files
 Pipfile.lock
-pyproject.toml
 
 # DynamoDB
 dynamodb_local_latest/
 
 # Zip
 *.zip
 
-notebooks
+notebooks
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,12 +1,16 @@
 # See https://pre-commit.com for more information
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
--   repo: https://github.com/pre-commit/pre-commit-hooks
+  - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v1.4.0
     hooks:
-    - id: flake8
-      additional_dependencies: [flake8-docstrings]
-    - id: check-added-large-files
-      args: ['--maxkb=1024']
-      exclude: ^tests/data
-    - id: detect-private-key
+      - id: check-added-large-files
+        args: ['--maxkb=1024']
+        exclude: ^tests/data
+      - id: detect-private-key
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.14
+    hooks:
+      - id: ruff-format
+      - id: ruff
+        args: [ --fix, --exit-non-zero-on-fix ]
diff --git a/analysis/civic/examples/harvester/civic_harvester_example.py b/analysis/civic/examples/harvester/civic_harvester_example.py
@@ -8,32 +8,35 @@
 def create_evidence_examples(data):
     """Create five CIViC evidence examples."""
     evidence_items = list()
-    for i in range(len(data['evidence'])):
-        if data['evidence'][i]['assertions']:
-            evidence_items.append(data['evidence'][i])
+    for i in range(len(data["evidence"])):
+        if data["evidence"][i]["assertions"]:
+            evidence_items.append(data["evidence"][i])
         if len(evidence_items) == 6:
             break
 
     for evidence_item in evidence_items:
-        variant_id = evidence_item['variant_id']
-        gene_id = evidence_item['gene_id']
-        assertions = evidence_item['assertions']
+        variant_id = evidence_item["variant_id"]
+        gene_id = evidence_item["gene_id"]
+        assertions = evidence_item["assertions"]
 
-        for v in data['variants']:
-            if v['id'] == variant_id:
+        for v in data["variants"]:
+            if v["id"] == variant_id:
                 variant = v
 
-        for g in data['genes']:
-            if g['id'] == gene_id:
+        for g in data["genes"]:
+            if g["id"] == gene_id:
                 gene = g
 
-        with open(f"{PROJECT_ROOT}/analysis/civic/examples/harvester/"
-                  f"{evidence_item['name']}.json", 'w+') as f:
+        with open(
+            f"{PROJECT_ROOT}/analysis/civic/examples/harvester/"
+            f"{evidence_item['name']}.json",
+            "w+",
+        ) as f:
             example = {
-                'EVIDENCE': evidence_item,
-                'GENE': gene,
-                'VARIANT': variant,
-                'ASSERTIONS': assertions
+                "EVIDENCE": evidence_item,
+                "GENE": gene,
+                "VARIANT": variant,
+                "ASSERTIONS": assertions,
             }
 
             json.dump(example, f, indent=4)
@@ -45,26 +48,30 @@ def create_variant_examples(data):
     """
     variants_ids = [12, 1, 221, 190]
     variants = list()
-    for i in range(len(data['variants'])):
-        if data['variants'][i]['id'] in variants_ids:
-            variants.append(data['variants'][i])
+    for i in range(len(data["variants"])):
+        if data["variants"][i]["id"] in variants_ids:
+            variants.append(data["variants"][i])
 
     for variant in variants:
-        with open(f"{PROJECT_ROOT}/analysis/civic/examples/harvester/"
-                  f"{variant['name'].lower()}.json", 'w+') as f:
-            variant['evidence_items'] = variant['evidence_items'][0]
+        with open(
+            f"{PROJECT_ROOT}/analysis/civic/examples/harvester/"
+            f"{variant['name'].lower()}.json",
+            "w+",
+        ) as f:
+            variant["evidence_items"] = variant["evidence_items"][0]
             f.write(json.dumps(variant, indent=4))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     c = CIViCHarvester()
     c.harvest()
-    latest = sorted((APP_ROOT / "data" / "civic" / "harvester").glob("civic_harvester_*.json"))[-1]  # noqa: E501
+    latest = sorted(
+        (APP_ROOT / "data" / "civic" / "harvester").glob("civic_harvester_*.json")
+    )[-1]
     with open(latest, "r") as f:
         civic_data = json.load(f)
 
-    civic_ex_dir =\
-        PROJECT_ROOT / 'analysis' / 'civic' / 'examples' / 'harvester'
+    civic_ex_dir = PROJECT_ROOT / "analysis" / "civic" / "examples" / "harvester"
     civic_ex_dir.mkdir(exist_ok=True, parents=True)
 
     create_evidence_examples(civic_data)

diff --git a/analysis/civic/examples/transform/civic_transform_example.py b/analysis/civic/examples/transform/civic_transform_example.py
@@ -1,28 +1,29 @@
 """Create an example json file for CIViC Transform."""
 import json
 
-from metakb import PROJECT_ROOT, APP_ROOT
+from metakb import APP_ROOT, PROJECT_ROOT
 from metakb.transform import CIViCTransform
 
 
 def create_civic_example(civic_data):
     """Create CIViC transform examples from list of evidence items."""
     ex = {
-        'statements': [],
-        'propositions': [],
-        'variation_descriptors': [],
-        'gene_descriptors': [],
-        'therapy_descriptors': [],
-        'disease_descriptors': [],
-        'methods': [],
-        'documents': []
+        "statements": [],
+        "propositions": [],
+        "variation_descriptors": [],
+        "gene_descriptors": [],
+        "therapy_descriptors": [],
+        "disease_descriptors": [],
+        "methods": [],
+        "documents": [],
     }
     supported_by_statement_ids = set()
-    for s in civic_data['statements']:
-        if s['id'] == 'civic.aid:6':
-            supported_by_statement_ids = \
-                {s for s in s['supported_by'] if s.startswith('civic.eid')}
-            supported_by_statement_ids.add(s['id'])
+    for s in civic_data["statements"]:
+        if s["id"] == "civic.aid:6":
+            supported_by_statement_ids = {
+                s for s in s["supported_by"] if s.startswith("civic.eid")
+            }
+            supported_by_statement_ids.add(s["id"])
             break
 
     proposition_ids = set()
@@ -32,56 +33,66 @@ def create_civic_example(civic_data):
     gids = set()
     methods = set()
     documents = set()
-    for s in civic_data['statements']:
-        if s['id'] in supported_by_statement_ids:
-            ex['statements'].append(s)
-            proposition_ids.add(s['proposition'])
-            vids.add(s['variation_descriptor'])
-            tids.add(s['therapy_descriptor'])
-            dids.add(s['disease_descriptor'])
-            methods.add(s['method'])
-            documents.update({d for d in s['supported_by'] if
-                             not d.startswith('civic.eid')})
+    for s in civic_data["statements"]:
+        if s["id"] in supported_by_statement_ids:
+            ex["statements"].append(s)
+            proposition_ids.add(s["proposition"])
+            vids.add(s["variation_descriptor"])
+            tids.add(s["therapy_descriptor"])
+            dids.add(s["disease_descriptor"])
+            methods.add(s["method"])
+            documents.update(
+                {d for d in s["supported_by"] if not d.startswith("civic.eid")}
+            )
 
-    for p in civic_data['propositions']:
-        if p['id'] in proposition_ids:
-            ex['propositions'].append(p)
+    for p in civic_data["propositions"]:
+        if p["id"] in proposition_ids:
+            ex["propositions"].append(p)
 
-    for v in civic_data['variation_descriptors']:
-        if v['id'] in vids:
-            ex['variation_descriptors'].append(v)
-            gids.add(v['gene_context'])
+    for v in civic_data["variation_descriptors"]:
+        if v["id"] in vids:
+            ex["variation_descriptors"].append(v)
+            gids.add(v["gene_context"])
 
-    for t in civic_data['therapy_descriptors']:
-        if t['id'] in tids:
-            ex['therapy_descriptors'].append(t)
+    for t in civic_data["therapy_descriptors"]:
+        if t["id"] in tids:
+            ex["therapy_descriptors"].append(t)
 
-    for d in civic_data['disease_descriptors']:
-        if d['id'] in dids:
-            ex['disease_descriptors'].append(d)
+    for d in civic_data["disease_descriptors"]:
+        if d["id"] in dids:
+            ex["disease_descriptors"].append(d)
 
-    for g in civic_data['gene_descriptors']:
-        if g['id'] in gids:
-            ex['gene_descriptors'].append(g)
+    for g in civic_data["gene_descriptors"]:
+        if g["id"] in gids:
+            ex["gene_descriptors"].append(g)
 
-    for m in civic_data['methods']:
-        if m['id'] in methods:
-            ex['methods'].append(m)
+    for m in civic_data["methods"]:
+        if m["id"] in methods:
+            ex["methods"].append(m)
 
-    for d in civic_data['documents']:
-        if d['id'] in documents:
-            ex['documents'].append(d)
+    for d in civic_data["documents"]:
+        if d["id"] in documents:
+            ex["documents"].append(d)
 
-    with open(PROJECT_ROOT / "analysis" / "civic" / "examples" /  # noqa: W504
-              "transform" / "civic_cdm_example.json", 'w+') as f2:
+    with open(
+        PROJECT_ROOT
+        / "analysis"
+        / "civic"
+        / "examples"
+        / "transform"
+        / "civic_cdm_example.json",
+        "w+",
+    ) as f2:
         json.dump(ex, f2, indent=4)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     civic = CIViCTransform()
     civic.transform()
     civic.create_json()
-    latest = sorted((APP_ROOT / "data" / "civic" / "transform").glob("civic_cdm_*.json"))[-1]  # noqa: E501
+    latest = sorted(
+        (APP_ROOT / "data" / "civic" / "transform").glob("civic_cdm_*.json")
+    )[-1]
     with open(latest, "r") as f:
         civic_data = json.load(f)
     create_civic_example(civic_data)
diff --git a/analysis/graph/db_helper.py b/analysis/graph/db_helper.py
@@ -1,19 +1,19 @@
 """Utility function to load/reload graph for development."""
-from metakb.database import Graph
-from metakb import APP_ROOT
 import json
 
+from metakb import APP_ROOT
+from metakb.database import Graph
 
 g = Graph(uri="bolt://localhost:7687", credentials=("neo4j", "admin"))
 g.clear()
 
-fpath = APP_ROOT / 'data' / 'civic' / 'transform' / 'civic_cdm.json'
-with open(fpath, 'r') as f:
+fpath = APP_ROOT / "data" / "civic" / "transform" / "civic_cdm.json"
+with open(fpath, "r") as f:
     items = json.load(f)
 
 count = 0
 for item in items:
-    if 'assertion' in item.keys():
+    if "assertion" in item.keys():
         continue
     else:
         g.add_transformed_data(item)

diff --git a/analysis/graph/missing_diseases_counts.txt b/analysis/graph/missing_diseases_counts.txt
@@ -0,0 +1,2 @@
+TALL and T-Cell Acute Lymphoid Leukemia, 6
+T-Cell Acute Lymphoid Leukemia, 6
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		TALL and T-Cell Acute Lymphoid Leukemia, 6
		T-Cell Acute Lymphoid Leukemia, 6