[ENH] add pipeline configuration/structure #3

Merged
merged 44 commits into from Nov 25, 2024

Changes from 7 commits
Commits (44)
6fe2bfb
adding testing
jdkent Oct 22, 2024
bbbfc9d
add dependent pipeline
jdkent Oct 24, 2024
60c0978
mark pipeline as (in)dependent
jdkent Oct 24, 2024
89d45f3
wip: start modifying the existing pipeline
jdkent Oct 24, 2024
5e8bafc
merge in new changes
jdkent Oct 24, 2024
90bee39
Restructure package
adelavega Oct 30, 2024
cb16fc9
add filter_inputs function
jdkent Oct 30, 2024
58bc727
Refactor init logic to dataclasses
adelavega Oct 30, 2024
cfff8bb
Both group and independent can use the same function name ('function'…
adelavega Oct 30, 2024
79bfdcc
group_function to function
adelavega Oct 30, 2024
0625124
_hash_attrs instead
adelavega Oct 30, 2024
c26ef1b
Set default _hash_attrs
adelavega Oct 30, 2024
e907624
refactor based on feedback
jdkent Oct 31, 2024
85b20dd
add pipeline name to output path
jdkent Oct 31, 2024
ddd5b67
wip: modify readme
jdkent Oct 31, 2024
23bb537
fix merge
jdkent Oct 31, 2024
c99c83f
add tests dependencies
jdkent Oct 31, 2024
a78f241
add test for participant demographics
jdkent Nov 14, 2024
cdbdec2
opensource data
jdkent Nov 15, 2024
d1e2a31
remove old functions
jdkent Nov 15, 2024
cd5bb83
commit the cassette
jdkent Nov 15, 2024
99095fb
add dependencies
jdkent Nov 15, 2024
ef0b25d
allow installable pyproject
jdkent Nov 15, 2024
0839a6e
move test directory and remove top level __init__
jdkent Nov 15, 2024
585fc21
try underscores
jdkent Nov 15, 2024
5cdfc6d
Revert "allow installable pyproject"
jdkent Nov 15, 2024
306e9ec
Revert "Revert "allow installable pyproject""
jdkent Nov 15, 2024
693cb76
Revert "try underscores"
jdkent Nov 15, 2024
c3b5767
Revert "move test directory and remove top level __init__"
jdkent Nov 15, 2024
8e7152f
remove init
jdkent Nov 15, 2024
20af580
remove old files
jdkent Nov 15, 2024
5cff6be
switch to version 5
jdkent Nov 15, 2024
194e9b1
use editable install
jdkent Nov 15, 2024
6f45fba
trigger variable
jdkent Nov 15, 2024
b6e26b0
add fake key
jdkent Nov 15, 2024
08e534a
Update ns_pipelines/word_count/run.py
jdkent Nov 16, 2024
e8108fd
Update ns_pipelines/participant_demographics/run.py
jdkent Nov 16, 2024
c366e61
Update ns_pipelines/word_count/run.py
jdkent Nov 18, 2024
e1fcd2b
Update ns_pipelines/word_count/run.py
jdkent Nov 18, 2024
01a70f0
Update ns_pipelines/participant_demographics/run.py
jdkent Nov 18, 2024
ce537b8
Update ns_pipelines/word_count/run.py
jdkent Nov 19, 2024
44ad3c6
Update ns_pipelines/word_count/run.py
jdkent Nov 19, 2024
35c09aa
change the names
jdkent Nov 21, 2024
8c5237f
work with .keys file
jdkent Nov 22, 2024
32 changes: 32 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,32 @@
name: Install and Test

on:
push:
branches:
- main
pull_request:
branches:
- main

concurrency:
group: testing-${{ github.ref }}
cancel-in-progress: true

jobs:
test:
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.8'

- name: Install dependencies
run: pip install .[tests]

- name: Test with pytest
run: pytest
2 changes: 2 additions & 0 deletions .gitignore
@@ -106,3 +106,5 @@ venv.bak/
*.swp
.swo
.swn

_version.py
26 changes: 0 additions & 26 deletions README

This file was deleted.

129 changes: 129 additions & 0 deletions README.md
@@ -0,0 +1,129 @@
# ns-text-extraction-workflows

This repository contains pipelines and scripts for extracting features from text using Natural Language Processing (NLP), Large Language Models (LLMs),
and other algorithms across thousands of articles in the NeuroStore database.

## Installation

To install the necessary dependencies, run:

pip install -r requirements.txt


## Usage
### Running pipelines
Executable workflows in `pipelines/{pipeline_name}/run.py` take standardized pubget-style text inputs (one row per article).


Run all available pipelines and harmonize their outputs using the CLI (TODO).
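
While the CLI is a TODO, pipelines can be driven directly from Python along these lines (the `WordCountExtraction` class name and its `run` signature are hypothetical placeholders; only `Dataset` and its `slice` method are defined in `ns_pipelines/dataset.py`):

```python
from pathlib import Path

from ns_pipelines.dataset import Dataset
# hypothetical pipeline class; the actual name/signature may differ
from ns_pipelines.word_count.run import WordCountExtraction

# load every study found in a pubget-style input directory
dataset = Dataset(Path("data/inputs"))

# optionally restrict processing to a subset of study IDs
subset = dataset.slice(["0123456789ab"])

# run one pipeline and write its outputs under data/outputs/
pipeline = WordCountExtraction()
pipeline.run(subset, output_directory=Path("data/outputs"))
```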


### Pipeline outputs
Pipeline results are written to `data/outputs/{input_hash}/{pipeline_name}/`.
Outputs include extracted features (`features.csv`), feature descriptions (`descriptions.json`), and extraction information (`info.json`).
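
For example, a single run's output directory might look like the following (the hash and pipeline name are placeholders):

```
data/outputs/
└── a1b2c3d4e5f6/            # {input_hash}
    └── word_count/          # {pipeline_name}
        ├── features.csv
        ├── descriptions.json
        └── info.json
```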

Pipeline outputs are not stored as part of this repository; see the `ns-text-extraction-outputs` sub-repository.

### Types of pipelines


#### Each study is independently processed

Three scenarios need to be handled between runs:

1) Scenario 1: nothing changed
2) Scenario 2: a study was added
3) Scenario 3: a study was changed

`info.json` in the output directory records run metadata, for example:
- increment (value): 0
- date: 2021-09-01

ns-pond: no hashing.
We hash the inputs to the pipeline and store that hash in `info.json` in the output directory (see the sketch below).

Keep a place for the raw output of the API/external service (`raw.json`) alongside a cleaned version (`clean.json`): each pipeline can provide a clean function that post-processes its raw output.
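
A minimal sketch of that hashing step, assuming the per-study inputs are available as file paths (the function names and the exact `info.json` fields are illustrative, not a committed API):

```python
import hashlib
import json
from datetime import date
from pathlib import Path


def hash_inputs(input_files, config):
    """Hash the pipeline inputs: file contents plus the pipeline configuration."""
    digest = hashlib.sha256()
    for path in sorted(Path(p) for p in input_files):
        digest.update(path.read_bytes())
    digest.update(json.dumps(config, sort_keys=True).encode())
    return digest.hexdigest()[:12]


def write_info(output_dir, input_hash):
    """Record the input hash and run metadata in info.json."""
    info = {"input_hash": input_hash, "increment": 0, "date": date.today().isoformat()}
    (Path(output_dir) / "info.json").write_text(json.dumps(info, indent=2))
```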

#### Each study is processed in the context of all other studies

- Have a dev version that only includes open-access papers.
- Outputs are organized by pipeline name, then version, then hashed runs, e.g. `pipeline/v1.0.0/hash_run-01`.
- Here the hash is just the hash of the pipeline config (see the sketch below).

For independent studies:
- copy over the studies that have already been processed and haven't been changed
- re-run the pipeline on studies that have been changed

## Notes

Study-independent results:

    /pipeline_name/v1.0.0/conf-#000A/run-01/study-01/input.json
                                           /study-02/input.json
                                           /results.json

    /pipeline_name/v1.0.0/conf-#000A/run-02/study-03/

Study-dependent results:

    /pipeline_name/v1.0.0/#sbqA_run-01/study-01
                                      /study-02
    /pipeline_name/v1.0.0/#sbqA_run-02/study-01
                                      /study-02
                                      /study-03

Re-run a study-independent pipeline:
1. Update with new: create a new run directory containing only new or updated studies (see the sketch below).
2. Force a re-run for a given set of inputs (from a particular directory; we are not using inheritance here).
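
A minimal sketch of the "update with new" case, assuming each study's `info.json` from the previous run stores its input hash (the layout and field names are illustrative):

```python
import json
from pathlib import Path


def studies_to_rerun(previous_run_dir, current_hashes):
    """Return study IDs whose inputs are new or changed since the previous run.

    previous_run_dir: directory of a previous run containing <study_id>/info.json files.
    current_hashes: dict mapping study ID -> freshly computed input hash.
    """
    previous_hashes = {}
    for info_file in Path(previous_run_dir).glob("*/info.json"):
        info = json.loads(info_file.read_text())
        previous_hashes[info_file.parent.name] = info.get("input_hash")

    return [
        study_id
        for study_id, input_hash in current_hashes.items()
        if previous_hashes.get(study_id) != input_hash
    ]
```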

Re-run a study-dependent pipeline:
1. Re-run on all studies.


After an update, `database.study_results_table`:

| id | study | conf  | run |
|----|-------|-------|-----|
| 0  | 01    | #000A | 01  |
| 1  | 02    | #000A | 01  |
| 2  | 03    | #000A | 02  |

After a re-run, `database.study_results_table`:

| id | study | conf  | run |
|----|-------|-------|-----|
| 0  | 01    | #000A | 01  |
| 1  | 02    | #000A | 01  |
| 2  | 03    | #000A | 02  |
| 3  | 01    | #000A | 02  |
| 4  | 02    | #000A | 02  |

## TF-IDF gets its own unique table
## Participant demographics get their own unique table


## Have a table for feature names?

`database.study_results_values_table` columns:
id, study_results_table_fk, feature (name), value, certainty


`database.pipeline_table`

| id | pipeline_name     | pipeline_description | version | study_dependent? | ace_compatible? | pubget_compatible? | derivative |
|----|-------------------|----------------------|---------|------------------|-----------------|--------------------|------------|
| 0  | gpt3_embed        | wat                  | 1.0.0   | False            | True            | True               | False      |
| 1  | HDBSCAN           | wat                  | 1.0.0   | True             | False           | False              | True       |
| 2  | TF-IDF            | wat                  | 1.0.0   | True             | False           | True               | False      |
| 3  | embed_and_HDBSCAN | wat                  | 1.0.0   | True             | True            | True               | False      |

`database.pipeline_configs_table`

| id | pipeline_fk | configuration            | configuration_hash |
|----|-------------|--------------------------|--------------------|
| 0  | 0           | {use_cheap_option: true} | #000A              |
| 1  | 1           | {dimensions: 10}         | #XXXX              |

`database.pipeline_run_table` columns:
id, pipeline_fk, config_hash_fk, run_index, description, date
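
A rough sketch of how these tables could map onto SQLAlchemy models (table, column, and type choices here are illustrative, not a committed schema):

```python
from sqlalchemy import JSON, Boolean, Column, Date, Float, ForeignKey, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Pipeline(Base):
    __tablename__ = "pipeline_table"
    id = Column(Integer, primary_key=True)
    pipeline_name = Column(String, nullable=False)
    pipeline_description = Column(String)
    version = Column(String)
    study_dependent = Column(Boolean)
    ace_compatible = Column(Boolean)
    pubget_compatible = Column(Boolean)
    derivative = Column(Boolean)


class PipelineConfig(Base):
    __tablename__ = "pipeline_configs_table"
    id = Column(Integer, primary_key=True)
    pipeline_fk = Column(Integer, ForeignKey("pipeline_table.id"))
    configuration = Column(JSON)
    configuration_hash = Column(String, unique=True, index=True)


class PipelineRun(Base):
    __tablename__ = "pipeline_run_table"
    id = Column(Integer, primary_key=True)
    pipeline_fk = Column(Integer, ForeignKey("pipeline_table.id"))
    config_hash_fk = Column(String, ForeignKey("pipeline_configs_table.configuration_hash"))
    run_index = Column(Integer)
    description = Column(String)
    date = Column(Date)


class StudyResult(Base):
    __tablename__ = "study_results_table"
    id = Column(Integer, primary_key=True)
    study = Column(String)
    conf = Column(String)
    run = Column(Integer)


class StudyResultValue(Base):
    __tablename__ = "study_results_values_table"
    id = Column(Integer, primary_key=True)
    study_results_table_fk = Column(Integer, ForeignKey("study_results_table.id"))
    feature = Column(String)
    value = Column(String)
    certainty = Column(Float)
```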


## TODO: how do I represent results in the database?
File renamed without changes.
163 changes: 163 additions & 0 deletions ns_pipelines/dataset.py
@@ -0,0 +1,163 @@
"""Dataset creation for processing inputs."""
from copy import deepcopy
from dataclasses import dataclass, field
from pathlib import Path
import re
import json
from typing import Union, Optional

INPUTS = [
"text",
"coordinates",
"metadata",
"html",
"xml",
"tables",
"tables_xml",
]

@dataclass
class AceRaw:
html: Path

@dataclass
class PubgetRaw:
xml: Path
tables: dict = None
tables_xml: Path = None

@dataclass
class ProcessedData:
coordinates: Path = None
text: Path = None
metadata: Path = None
raw: Optional[Union['PubgetRaw', 'AceRaw']] = field(default=None)

@dataclass
class Study:
dbid: str
doi: str = None
pmid: str = None
pmcid: str = None
ace: ProcessedData = field(default_factory=ProcessedData)
pubget: ProcessedData = field(default_factory=ProcessedData)


class Dataset:
"""Dataset class for processing inputs."""

def __init__(self, input_directory):
"""Initialize the dataset."""
self.data = self.load_directory(input_directory)

def slice(self, ids):
"""Slice the dataset."""
deepcopy_obj = deepcopy(self)
deepcopy_obj.data = {k: v for k, v in deepcopy_obj.data.items() if k in ids}
return deepcopy_obj

    def load_directory(self, input_directory):
        """Load the input directory.

        input_directory (Path): directory containing one sub-directory per
            study (12-character alphanumeric IDs), each with an
            identifiers.json file plus source/ and processed/ sub-directories
            (ace and/or pubget, whichever are present).
        """
pattern = re.compile(r'^[a-zA-Z0-9]{12}$')

sub_directories = input_directory.glob("[0-9A-Za-z]*")

study_directories = [
dir_ for dir_ in sub_directories
if dir_.is_dir() and pattern.match(dir_.name)
]

dset_data = {}

for study_dir in study_directories:

study_id = study_dir.name
study_obj = Study(dbid=study_id)
# associate IDs with study object
with open((study_dir / "identifiers.json"), "r") as ident_fp:
ids = json.load(ident_fp)

study_obj.doi = ids["doi"] or None
study_obj.pmid = ids["pmid"] or None
study_obj.pmcid = ids["pmcid"] or None

source_dir = study_dir / "source"

# check if the source ace directory exists and load appropriate files
if (source_dir / "ace").exists():
study_obj.ace.raw = AceRaw(html=source_dir / "ace" / f"{study_obj.pmid}.html")

# check if the source pubget directory exists and load appropriate files
if (source_dir / "pubget").exists():
study_obj.pubget.raw = PubgetRaw(
xml=source_dir / "pubget" / f"{study_obj.pmcid}.xml",
)
study_obj.pubget.raw.tables_xml = source_dir / "pubget" / "tables" / "tables.xml"

                # each table is stored as a pair of files: an .xml file with the
                # table contents and a .json file with its metadata
                tables_dir = source_dir / "pubget" / "tables"
                tables_files = list(tables_dir.glob("*.xml")) + list(tables_dir.glob("*.json"))
                tables_files = [t for t in tables_files if t.name != "tables.xml"]

                num_tables = len(tables_files) // 2
                study_obj.pubget.raw.tables = {
                    '{0:03}'.format(t): {"metadata": None, "contents": None}
                    for t in range(num_tables)
                }

                for tf in tables_files:
                    # file names look like table_000*.xml/.json, so the second
                    # underscore-delimited token is the zero-padded table number
                    table_number = tf.stem.split("_")[1]
                    if tf.suffix == ".json":
                        key = "metadata"
                    else:
                        key = "contents"

                    study_obj.pubget.raw.tables[table_number][key] = tf

# processed directory
processed_dir = study_dir / "processed"
if (processed_dir / "ace").exists():
study_obj.ace.coordinates = processed_dir / "ace" / "coordinates.csv"
study_obj.ace.text = processed_dir / "ace" / "text.txt"
study_obj.ace.metadata = processed_dir / "ace" / "metadata.json"

if (processed_dir / "pubget").exists():
study_obj.pubget.coordinates = processed_dir / "pubget" / "coordinates.csv"
study_obj.pubget.text = processed_dir / "pubget" / "text.txt"
study_obj.pubget.metadata = processed_dir / "pubget" / "metadata.json"

dset_data[study_id] = study_obj

return dset_data

def __len__(self):
"""Return the length of the dataset."""
return len(self.data)

def __getitem__(self, idx):
"""Return an item from the dataset."""
return self.data[idx]



class PipelineInputFilter:
"""Filter for pipeline inputs."""

    def __init__(self, pipeline, output_directory, overwrite=False):
        """Initialize the filter.

        pipeline (Pipeline): The pipeline to filter.
        output_directory (str): The output directory where the pipeline has previously been run.
        overwrite (bool): Whether to overwrite the existing output.
        """
        self.pipeline = pipeline
        self.output_directory = output_directory
        self.overwrite = overwrite

def filter(self, dataset):
"""Filter the dataset."""
pass

def load_outputs(self):
"""Load the outputs."""
pass
Loading