[ENH] add pipeline configuration/structure #3

Merged (44 commits) on Nov 25, 2024
Commits
6fe2bfb  adding testing (jdkent, Oct 22, 2024)
bbbfc9d  add dependent pipeline (jdkent, Oct 24, 2024)
60c0978  mark pipeline as (in)dependent (jdkent, Oct 24, 2024)
89d45f3  wip: start modifying the existing pipeline (jdkent, Oct 24, 2024)
5e8bafc  merge in new changes (jdkent, Oct 24, 2024)
90bee39  Restructure package (adelavega, Oct 30, 2024)
cb16fc9  add filter_inputs function (jdkent, Oct 30, 2024)
58bc727  Refactor init logic to dataclasses (adelavega, Oct 30, 2024)
cfff8bb  Both group and independent can use the same function name ('function'… (adelavega, Oct 30, 2024)
79bfdcc  group_function to function (adelavega, Oct 30, 2024)
0625124  _hash_attrs instead (adelavega, Oct 30, 2024)
c26ef1b  Set default _hash_attrs (adelavega, Oct 30, 2024)
e907624  refactor based on feedback (jdkent, Oct 31, 2024)
85b20dd  add pipeline name to output path (jdkent, Oct 31, 2024)
ddd5b67  wip: modify readme (jdkent, Oct 31, 2024)
23bb537  fix merge (jdkent, Oct 31, 2024)
c99c83f  add tests dependencies (jdkent, Oct 31, 2024)
a78f241  add test for participant demographics (jdkent, Nov 14, 2024)
cdbdec2  opensource data (jdkent, Nov 15, 2024)
d1e2a31  remove old functions (jdkent, Nov 15, 2024)
cd5bb83  commit the cassette (jdkent, Nov 15, 2024)
99095fb  add dependencies (jdkent, Nov 15, 2024)
ef0b25d  allow installable pyproject (jdkent, Nov 15, 2024)
0839a6e  move test directory and remove top level __init__ (jdkent, Nov 15, 2024)
585fc21  try underscores (jdkent, Nov 15, 2024)
5cdfc6d  Revert "allow installable pyproject" (jdkent, Nov 15, 2024)
306e9ec  Revert "Revert "allow installable pyproject"" (jdkent, Nov 15, 2024)
693cb76  Revert "try underscores" (jdkent, Nov 15, 2024)
c3b5767  Revert "move test directory and remove top level __init__" (jdkent, Nov 15, 2024)
8e7152f  remove init (jdkent, Nov 15, 2024)
20af580  remove old files (jdkent, Nov 15, 2024)
5cff6be  switch to version 5 (jdkent, Nov 15, 2024)
194e9b1  use editable install (jdkent, Nov 15, 2024)
6f45fba  trigger variable (jdkent, Nov 15, 2024)
b6e26b0  add fake key (jdkent, Nov 15, 2024)
08e534a  Update ns_pipelines/word_count/run.py (jdkent, Nov 16, 2024)
e8108fd  Update ns_pipelines/participant_demographics/run.py (jdkent, Nov 16, 2024)
c366e61  Update ns_pipelines/word_count/run.py (jdkent, Nov 18, 2024)
e1fcd2b  Update ns_pipelines/word_count/run.py (jdkent, Nov 18, 2024)
01a70f0  Update ns_pipelines/participant_demographics/run.py (jdkent, Nov 18, 2024)
ce537b8  Update ns_pipelines/word_count/run.py (jdkent, Nov 19, 2024)
44ad3c6  Update ns_pipelines/word_count/run.py (jdkent, Nov 19, 2024)
35c09aa  change the names (jdkent, Nov 21, 2024)
8c5237f  work with .keys file (jdkent, Nov 22, 2024)
Files changed (changes from 18 commits)
6 changes: 4 additions & 2 deletions .github/workflows/test.yml

@@ -21,12 +21,14 @@ jobs:
       uses: actions/checkout@v2

     - name: Set up Python
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: '3.8'

     - name: Install dependencies
-      run: pip install .[tests]
+      run: pip install -e .[tests,participant_demographics,word_count]

     - name: Test with pytest
+      env:
+        OPENAI_API_KEY: "fake_key"
       run: pytest
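Why a dummy key is enough here: the tests extra (see the pyproject.toml diff below) adds pytest-recording and vcrpy, and the "commit the cassette" commit stores recorded OpenAI responses, so CI replays those recordings instead of making live API calls. A minimal sketch of a cassette-backed test under that assumption; the test name and body are hypothetical, not taken from this PR:

```python
import os

import pytest


@pytest.mark.vcr()  # pytest-recording marker: replay the committed cassette instead of calling OpenAI
def test_extraction_runs_offline():
    # In CI the key is the dummy value set above; a live request would fail
    # authentication, so a passing test implies the cassette was replayed.
    assert os.getenv("OPENAI_API_KEY") == "fake_key"
```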
Empty file removed: __init__.py
3 changes: 0 additions & 3 deletions ns_pipelines/participant_demographics/__init__.py

@@ -1,3 +0,0 @@
-from .run import __main__ as run
-
-__all__ = ['run']
47 changes: 23 additions & 24 deletions ns_pipelines/participant_demographics/clean.py

@@ -2,49 +2,48 @@
 import numpy as np


-def clean_predictions(predictions):
-    # Clean known issues with GPT demographics predictions
-    predictions = [p for p in predictions if p and "groups" in p]
+def clean_prediction(prediction):
+    # Clean known issues with GPT demographics prediction

     meta_keys = ["pmid", "rank", "start_char", "end_char", "id"]
-    meta_keys = [k for k in meta_keys if k in predictions[0]]
+    meta_keys = [k for k in meta_keys if k in prediction]

     # Convert JSON to DataFrame
-    predictions = pd.json_normalize(
-        predictions, record_path=["groups"],
+    prediction = pd.json_normalize(
+        prediction, record_path=["groups"],
         meta=meta_keys
     )

-    predictions.columns = predictions.columns.str.replace(' ', '_')
+    prediction.columns = prediction.columns.str.replace(' ', '_')

-    predictions = predictions.fillna(value=np.nan)
-    predictions["group_name"] = predictions["group_name"].fillna("healthy")
+    prediction = prediction.fillna(value=np.nan)
+    prediction["group_name"] = prediction["group_name"].fillna("healthy")

     # Drop rows where count is NA
-    predictions = predictions[~pd.isna(predictions["count"])]
+    prediction = prediction[~pd.isna(prediction["count"])]

     # Set group_name to healthy if no diagnosis
-    predictions.loc[
-        (predictions["group_name"] != "healthy") & (pd.isna(predictions["diagnosis"])),
+    prediction.loc[
+        (prediction["group_name"] != "healthy") & (pd.isna(prediction["diagnosis"])),
         "group_name",
     ] = "healthy"

     # If no male count, substract count from female count columns
-    ix_male_miss = (pd.isna(predictions["male_count"])) & ~(
-        pd.isna(predictions["female_count"])
+    ix_male_miss = (pd.isna(prediction["male_count"])) & ~(
+        pd.isna(prediction["female_count"])
     )
-    predictions.loc[ix_male_miss, "male_count"] = (
-        predictions.loc[ix_male_miss, "count"]
-        - predictions.loc[ix_male_miss, "female_count"]
+    prediction.loc[ix_male_miss, "male_count"] = (
+        prediction.loc[ix_male_miss, "count"]
+        - prediction.loc[ix_male_miss, "female_count"]
     )

     # Same for female count
-    ix_female_miss = (pd.isna(predictions["female_count"])) & ~(
-        pd.isna(predictions["male_count"])
+    ix_female_miss = (pd.isna(prediction["female_count"])) & ~(
+        pd.isna(prediction["male_count"])
     )
-    predictions.loc[ix_female_miss, "female_count"] = (
-        predictions.loc[ix_female_miss, "count"]
-        - predictions.loc[ix_female_miss, "male_count"]
+    prediction.loc[ix_female_miss, "female_count"] = (
+        prediction.loc[ix_female_miss, "count"]
+        - prediction.loc[ix_female_miss, "male_count"]
     )

-    return predictions
+    return {"groups": prediction.to_dict(orient="records")}
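For orientation, a small sketch of how the refactored clean_prediction might be called on a single document's prediction; the input field names are inferred from the columns the function touches and from record_path=["groups"], so treat the exact schema as an assumption:

```python
from ns_pipelines.participant_demographics.clean import clean_prediction

# Hypothetical single-document prediction; keys mirror the columns used above.
prediction = {
    "pmid": 12345,
    "groups": [
        {"group_name": "patients", "diagnosis": "ADHD", "count": 30,
         "male_count": None, "female_count": 14},
        {"group_name": None, "diagnosis": None, "count": 28,
         "male_count": 13, "female_count": 15},
    ],
}

cleaned = clean_prediction(prediction)
# cleaned == {"groups": [...]}: one record per group, with the missing male_count
# filled in as count - female_count and the unnamed group labelled "healthy".
```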
81 changes: 19 additions & 62 deletions ns_pipelines/participant_demographics/run.py
@@ -1,96 +1,51 @@
""" Extract participant demographics from HTML files. """
Review comment (Member): I wonder, given that we now have a class inside run.py with a run() method, whether we should rename run.py to something else? pipeline.py?

import os

from publang.extract import extract_from_text
from openai import OpenAI
from pathlib import Path
import json
import pandas as pd
import logging

from . import prompts
from .clean import clean_predictions
from .clean import clean_prediction

from ns_pipelines.pipeline import IndependentPipeline

def extract(extraction_model, extraction_client, docs, output_dir, prompt_set='', **extract_kwargs):

def extract(extraction_model, extraction_client, text, prompt_set='', **extract_kwargs):
extract_kwargs.pop('search_query', None)

# Extract
predictions = extract_from_text(
docs['body'].to_list(),
model=extraction_model, client=extraction_client,
text,
model=extraction_model,
client=extraction_client,
**extract_kwargs
)

# Add PMCID to predictions
for i, pred in enumerate(predictions):
if not pred:
logging.warning(f"No prediction for document {docs['pmid'].iloc[i]}")
continue
pred['pmid'] = int(docs['pmid'].iloc[i])
if not predictions:
logging.warning("No predictions found.")
return None, None

clean_preds = clean_predictions(predictions)
clean_preds = clean_prediction(predictions)

return predictions, clean_preds


def _load_client(model_name):
if 'gpt' in model_name:
client = OpenAI(api_key=os.getenv('MYOPENAI_API_KEY'))
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
Review comment (Member): The reason I had it this way was that if the environment variable is named OPENAI_API_KEY it is automatically picked up by the OpenAI client, so passing it explicitly is not necessary. I wanted the option of not passing that key to OpenAI, specifically for when you want to use the OpenAI client with another API (such as OpenRouter). So what I would do is add which API key to use as a configuration parameter, and in the production environment name it something other than OPENAI_API_KEY.

else:
Review comment (Member): In principle we can run this using other API keys and hence other model names, so perhaps let's not worry about validation here.

raise ValueError(f"Model {model_name} not supported")

return client


def _load_prompt_config(prompt_set):
return getattr(prompts, prompt_set)

def _save_predictions(predictions, clean_preds, extraction_model, prompt_set, output_dir):
short_model_name = extraction_model.split('/')[-1]
outname = f"{prompt_set}_{short_model_name}"
predictions_path = output_dir / f'{outname}.json'
clean_predictions_path = output_dir / f'{outname}_clean.csv'

json.dump(predictions, predictions_path.open('w'))

clean_preds.to_csv(
clean_predictions_path, index=False
)

def __main__(extraction_model, docs_path, prompt_set, output_dir=None, **kwargs):
""" Run the participant demographics extraction pipeline.

Args:
extraction_model (str): The model to use for extraction.
docs_path (str): The path to the csv file containing the documents.
prompt_set (str): The prompt set to use for the extraction.
output_dir (str): The directory to save the output files.
**kwargs: Additional keyword arguments to pass to the extraction function.
"""

docs = pd.read_csv(docs_path)

extraction_client = _load_client(extraction_model)

prompt_config = _load_prompt_config(prompt_set)
if kwargs is not None:
prompt_config.update(kwargs)

output_dir = Path(output_dir)

predictions, clean_preds = extract(
extraction_model, extraction_client, docs,
**prompt_config
)

if output_dir is not None:
_save_predictions(predictions, clean_preds, extraction_model, prompt_set, output_dir)

return predictions, clean_preds


def ParticipantDemographics(IndependentPipeline):
class ParticipantDemographicsExtraction(IndependentPipeline):
"""Participant demographics extraction pipeline."""

_version = "1.0.0"
@@ -100,7 +55,8 @@ def ParticipantDemographics(IndependentPipeline):
def __init__(
self,
extraction_model,
prompt_set, inputs=("text",),
prompt_set,
inputs=("text",),
input_sources=("pubget", "ace"),
Review comment (Member): What I would do is add the key as part of the __init__. Later on, we could define a base class OpenAIPipeline that sets up the client for the subclass automatically and knows which parameters to expect.

Review comment (Member): i.e. the _load_client function could be part of this new parent class and is always called. For now it's fine though, we can cross that bridge later. For now just make the key a config parameter and rename the value of the key to something else.

**kwargs
):
@@ -117,15 +73,16 @@ def _run(self, study_inputs, n_cpus=1):
if self.kwargs is not None:
prompt_config.update(self.kwargs)

with open(study_inputs["text"]) as f:
text = f.read()

predictions, clean_preds = extract(
self.extraction_model,
extraction_client,
study_inputs["text"],
text,
prompt_set=self.prompt_set,
**prompt_config
)

# Save predictions

return {"predictions": predictions, "clean_predictions": clean_preds}
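To make the new structure concrete, here is a hedged sketch of how ParticipantDemographicsExtraction might be instantiated and run, folding in the review suggestion above to make the API-key environment-variable name a configuration parameter. The api_key_env_var argument and the concrete values are illustrative assumptions, not part of this diff; only extraction_model, prompt_set, inputs, and input_sources appear in the signature shown here.

```python
from ns_pipelines.participant_demographics.run import ParticipantDemographicsExtraction

pipeline = ParticipantDemographicsExtraction(
    extraction_model="gpt-4o-mini",        # assumed model name; must contain "gpt" for _load_client
    prompt_set="base_prompt",              # assumed attribute defined in prompts.py
    inputs=("text",),
    input_sources=("pubget", "ace"),
    # api_key_env_var="NS_OPENAI_KEY",     # reviewer suggestion: make the key name configurable
)

# IndependentPipeline subclasses implement _run(study_inputs, n_cpus=1); study_inputs["text"]
# is a path to one study's extracted text (normally supplied by the pipeline runner).
results = pipeline._run({"text": "/path/to/study/text.txt"})
# results == {"predictions": ..., "clean_predictions": {"groups": [...]}}
```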
9 changes: 7 additions & 2 deletions pyproject.toml

@@ -22,8 +22,8 @@ participant_demographics = [
     "pandas",
     "numpy",
     "pydantic",
-    "publang",
-    "openai",
+    "publang @ git+https://github.com/adelavega/publang.git",
+    "openai"
 ]
 umls_disease = [
     "pandas",
@@ -39,10 +39,15 @@ word_count = [

 tests = [
     "pytest",
+    "pytest-recording",
+    "vcrpy",
 ]

 [tool.hatch.version]
 source = "vcs"

+[tool.hatch.build.hooks.vcs]
+version-file = "ns_pipelines/_version.py"
+
 [tool.hatch.metadata]
 allow-direct-references = true
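With source = "vcs" and the new build hook, the package version is derived from git metadata at build or install time and written to ns_pipelines/_version.py. A minimal sketch of how that generated file would typically be consumed; the attribute name follows hatch-vcs/setuptools-scm conventions and is an assumption, since the generated file itself is not part of this diff:

```python
# ns_pipelines/_version.py is generated by the hatch-vcs build hook at install time.
from ns_pipelines._version import __version__

print(__version__)  # e.g. "0.0.1.dev44+g8c5237f" for an untagged checkout
```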
2 changes: 0 additions & 2 deletions requirements.txt

This file was deleted.

16 changes: 0 additions & 16 deletions setup.py.old

This file was deleted.
