oughtinc · pre-commit-ci · Oct 7, 2024 · Oct 7, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 minimum_pre_commit_version: "2.9.0"
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v5.0.0
     hooks:
       - id: check-yaml
         args: [--allow-multiple-documents]
@@ -15,20 +15,20 @@ repos:
     hooks:
       - id: prettier
   - repo: https://github.com/asottile/reorder-python-imports
-    rev: v3.10.0
+    rev: v3.13.0
     hooks:
       - id: reorder-python-imports
         args: [--py39-plus]
   - repo: https://github.com/psf/black
-    rev: 23.3.0
+    rev: 24.8.0
     hooks:
       - id: black
   - repo: https://github.com/PyCQA/flake8
-    rev: 6.0.0
+    rev: 7.1.1
     hooks:
       - id: flake8
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.4.1
+    rev: v1.11.2
     hooks:
       - id: mypy
         additional_dependencies: [numpy, httpx, pytest, structlog, types-PyYAML]

diff --git a/ice/agent.py b/ice/agent.py
@@ -21,8 +21,7 @@
 except ImportError:
 
     class Tfew(Agent):
-        def __init__(self, *args, **kwargs):
-            ...
+        def __init__(self, *args, **kwargs): ...
 
 
 MACHINE_AGENTS = {

diff --git a/ice/cache.py b/ice/cache.py
@@ -1,6 +1,7 @@
 """
 Decorator for caching function results to disk
 """
+
 import asyncio
 import functools
 import inspect

diff --git a/ice/evaluation/evaluate_recipe_result.py b/ice/evaluation/evaluate_recipe_result.py
@@ -138,9 +138,7 @@ def __str__(self) -> str:
         correctness = (
             "Correct"
             if self.is_correct == True
-            else "Incorrect"
-            if self.is_correct == False
-            else "Not evaluated"
+            else "Incorrect" if self.is_correct == False else "Not evaluated"
         )
         return f"""{correctness}.
     - Predicted: {self.predicted}
@@ -196,13 +194,17 @@ def evaluated_classifications(self) -> list[EvaluatedClassification]:
 
         for i in range(0, max(len(recipe_classifications), len(gold_classifications))):
             evaluated_classification = EvaluatedClassification(
-                predicted=recipe_classifications[i]
-                if i < len(recipe_classifications)
-                else None,
+                predicted=(
+                    recipe_classifications[i]
+                    if i < len(recipe_classifications)
+                    else None
+                ),
                 gold=gold_classifications[i] if i < len(gold_classifications) else None,
-                classification_eq=self.classification_eq[i]
-                if i < len(self.classification_eq)
-                else None,
+                classification_eq=(
+                    self.classification_eq[i]
+                    if i < len(self.classification_eq)
+                    else None
+                ),
             )
 
             evaluated_classifications.append(evaluated_classification)

diff --git a/ice/evaluation/evaluation_report.py b/ice/evaluation/evaluation_report.py
@@ -382,9 +382,9 @@ def make_dashboard_row_df(self):
                 classification_summary.proportion_correct
             )
 
-            row[
-                f"Classification {i+1} # evaluated"
-            ] = classification_summary.num_evaluated
+            row[f"Classification {i+1} # evaluated"] = (
+                classification_summary.num_evaluated
+            )
 
         df = pd.DataFrame([row])
         df.to_csv(
@@ -408,9 +408,9 @@ def make_experiments_evaluation_df(self):
                 "ice_commit": latest_commit_hash(),
                 "document_id": result.document_id,
                 "split": result.gold_standard.split if result.gold_standard else None,
-                "experiment": result.gold_standard.experiment
-                if result.gold_standard
-                else None,
+                "experiment": (
+                    result.gold_standard.experiment if result.gold_standard else None
+                ),
                 "total_gs_quotes": len(
                     result.evaluated_excerpts.gold_standards_in_excerpts_results
                 ),
@@ -420,9 +420,9 @@ def make_experiments_evaluation_df(self):
                 "excerpts": result.evaluated_excerpts.excerpts,
                 "gs_quotes": result.evaluated_excerpts.gold_standards_str(),
                 "answer": result.answer,
-                "gs_answer": result.gold_standard.answer
-                if result.gold_standard
-                else None,
+                "gs_answer": (
+                    result.gold_standard.answer if result.gold_standard else None
+                ),
                 "answer_rating": result.answer_rating,
                 "failure_modes": result.failure_modes,
             }

diff --git a/ice/evaluation/summarize_experiment_evals.py b/ice/evaluation/summarize_experiment_evals.py
@@ -35,13 +35,17 @@ async def summarize_experiment_evals(results_file: str):
                     row.get("classification_1"),
                     row.get("classification_2"),
                 ],
-                answer_rating=None
-                if pd.isna(row.get("answer_rating"))
-                else int(row.get("answer_rating")),
+                answer_rating=(
+                    None
+                    if pd.isna(row.get("answer_rating"))
+                    else int(row.get("answer_rating"))
+                ),
                 elicit_commit=row.get("elicit_commit"),
-                failure_modes=None
-                if pd.isna(row.get("failure_modes"))
-                else row.failure_modes.split(","),
+                failure_modes=(
+                    None
+                    if pd.isna(row.get("failure_modes"))
+                    else row.failure_modes.split(",")
+                ),
             )
             for _, row in recipe_df.iterrows()
         ]

diff --git a/ice/metrics/gold_paragraphs.py b/ice/metrics/gold_paragraphs.py
@@ -1,6 +1,7 @@
 """
 Make a dataframe that contains the paragraphs that contain the gold standard quotes.
 """
+
 import asyncio
 from pathlib import Path
 from typing import Optional

diff --git a/ice/metrics/gold_standards.py b/ice/metrics/gold_standards.py
@@ -184,8 +184,7 @@ def get_gold_standards(
     question_short_name: Optional[str] = None,
     experiment: Optional[str] = None,
     model_type: None = None,
-) -> list[GoldStandard[Any]]:
-    ...
+) -> list[GoldStandard[Any]]: ...
 
 
 @overload
@@ -195,8 +194,7 @@ def get_gold_standards(
     document_id: Optional[str] = None,
     question_short_name: Optional[str] = None,
     experiment: Optional[str] = None,
-) -> list[GoldStandard[ParsedGoldStandardType]]:
-    ...
+) -> list[GoldStandard[ParsedGoldStandardType]]: ...
 
 
 def get_gold_standards(
@@ -226,8 +224,7 @@ def get_gold_standard(
     question_short_name: Optional[str] = None,
     experiment: Optional[str] = None,
     model_type: None = None,
-) -> Optional[GoldStandard[Any]]:
-    ...
+) -> Optional[GoldStandard[Any]]: ...
 
 
 @overload
@@ -237,8 +234,7 @@ def get_gold_standard(
     document_id: Optional[str] = None,
     question_short_name: Optional[str] = None,
     experiment: Optional[str] = None,
-) -> Optional[GoldStandard[ParsedGoldStandardType]]:
-    ...
+) -> Optional[GoldStandard[ParsedGoldStandardType]]: ...
 
 
 def get_gold_standard(

diff --git a/ice/paper.py b/ice/paper.py
@@ -93,9 +93,9 @@ def parse_txt(file: Path) -> list[dict]:
                                 "number": section_title_number(current_section),
                             }
                         ],
-                        "sectionType": "abstract"
-                        if current_section == "Abstract"
-                        else "main",
+                        "sectionType": (
+                            "abstract" if current_section == "Abstract" else "main"
+                        ),
                     }
                 )
     return body

diff --git a/ice/recipes/blinding_dynamic.py b/ice/recipes/blinding_dynamic.py
@@ -21,6 +21,7 @@
 - routledge-2006.pdf
 - vittengl-2009.pdf
 """
+
 import itertools
 from typing import Any
 from typing import Literal
@@ -344,9 +345,9 @@ async def run(self, paper: Paper):
         results_by_intervention: dict[str, dict[Group, dict[str, Any]]] = {}
         interventions = await self.interventions(paper)
         for intervention in interventions:
-            results_by_intervention[
-                intervention
-            ] = await self.blinding_for_intervention(paper, intervention)
+            results_by_intervention[intervention] = (
+                await self.blinding_for_intervention(paper, intervention)
+            )
 
         recipe_results: list[RecipeResult] = []
         for intervention in interventions:

diff --git a/ice/recipes/consort_flow/baselines.py b/ice/recipes/consort_flow/baselines.py
@@ -342,9 +342,11 @@ async def _all_options(
         except TooLongRequestError:
             selections = remove_lowest_perplexity(selections)
     return PaperQaAnswer(
-        answer=["The question is not answered in the text."]
-        if do_return_list
-        else "The question is not answered in the text.",
+        answer=(
+            ["The question is not answered in the text."]
+            if do_return_list
+            else "The question is not answered in the text."
+        ),
         support_candidates=texts,
         support_labels=[False for text in texts],
         support_scores=[t[1] for t in texts_with_perplexities],

diff --git a/ice/recipes/consort_flow/golds.py b/ice/recipes/consort_flow/golds.py
@@ -54,9 +54,11 @@ def paper_to_allocation_gold_standards(
         (
             f"The {exp.name} experiment included {len(exp.arms or [])} arms: {', '.join((arm.name for arm in exp.arms or []))}. How many participants were initially allocated to the {arm.name} arm of the {exp.name} experiment?",
             texts,
-            arm.allocated.quotes
-            if arm.allocated and isinstance(arm.allocated, SampleSize)
-            else [],
+            (
+                arm.allocated.quotes
+                if arm.allocated and isinstance(arm.allocated, SampleSize)
+                else []
+            ),
         )
         for exp in gs.parsed_answer.experiments
         for arm in (exp.arms or [])

diff --git a/ice/recipes/experiments_and_arms/recipes/best_passages.py b/ice/recipes/experiments_and_arms/recipes/best_passages.py
@@ -43,9 +43,11 @@ async def rank_passages_selector(
     )  # really small non-infinite number
     closest = min(
         samples,
-        key=lambda sample: abs(sample.final_answer - mean_score)
-        if sample.final_answer
-        else float("inf"),
+        key=lambda sample: (
+            abs(sample.final_answer - mean_score)
+            if sample.final_answer
+            else float("inf")
+        ),
     )
     return PassageWithReasoning(
         passage=closest.passage,
@@ -154,9 +156,9 @@ async def score(
 
         sorted_answers = sorted(
             answers,
-            key=lambda prs: prs.final_answer
-            if prs.final_answer is not None
-            else float("-inf"),
+            key=lambda prs: (
+                prs.final_answer if prs.final_answer is not None else float("-inf")
+            ),
             reverse=True,
         )
         return sorted_answers

diff --git a/ice/recipes/experiments_and_arms/recipes/name_experiments.py b/ice/recipes/experiments_and_arms/recipes/name_experiments.py
@@ -156,13 +156,15 @@ async def name_experiments(
     assert experiment_names.final_answer is not None
     return (
         gs_names,
-        [
-            strip_enumeration_prefix(exp_name)
-            for exp_name in standardized_answer.split("\n")
-            if exp_name.strip()
-        ]
-        if standardized_answer
-        else [],
+        (
+            [
+                strip_enumeration_prefix(exp_name)
+                for exp_name in standardized_answer.split("\n")
+                if exp_name.strip()
+            ]
+            if standardized_answer
+            else []
+        ),
         paragraphs_to_keep,
         [str(p) for p in paragraphs],
     )

diff --git a/ice/recipes/meta/eval_paper_qa/common_baselines.py b/ice/recipes/meta/eval_paper_qa/common_baselines.py
@@ -197,9 +197,11 @@ async def preselected_few_shot_qa_baseline(
         Demonstration(
             question=g.question,
             texts=g.gold_support,
-            answer=g.gold_answer
-            if isinstance(g.gold_answer, str)
-            else numbered_list(g.gold_answer).transform(),
+            answer=(
+                g.gold_answer
+                if isinstance(g.gold_answer, str)
+                else numbered_list(g.gold_answer).transform()
+            ),
         )
         for g in demonstration_examples
     ]

diff --git a/ice/recipes/meta/eval_paper_qa/types.py b/ice/recipes/meta/eval_paper_qa/types.py
@@ -60,8 +60,7 @@ async def __call__(
         __paper: Paper,
         __question: str,
         __gold_support: Optional[Sequence[str]] = None,
-    ) -> PaperQaAnswer[AnswerType_contra]:
-        ...
+    ) -> PaperQaAnswer[AnswerType_contra]: ...
 
 
 class AnswerEvalMethod(Protocol[AnswerType_contra]):
@@ -70,8 +69,7 @@ async def __call__(
         question: str,
         ground_truth: AnswerType_contra,
         prediction: AnswerType_contra,
-    ) -> tuple[bool, str]:
-        ...
+    ) -> tuple[bool, str]: ...
 
 
 class ClassificationEvalMethod(Protocol):
@@ -81,5 +79,4 @@ async def __call__(
         predictions: Sequence[bool],
         ground_truth: Sequence[str],
         scores: Optional[Sequence[float]] = None,
-    ) -> BinaryClassificationMetrics:
-        ...
+    ) -> BinaryClassificationMetrics: ...
diff --git a/ice/recipes/placebo_dialogs.py b/ice/recipes/placebo_dialogs.py
@@ -402,9 +402,11 @@ async def analyze_experiment(self, paper: Paper, experiment: Experiment):
                     experiment=experiment,
                     classifications=[
                         aggregate_used["answer"],
-                        "Placebo"
-                        if has_placebo_info
-                        else "No placebo or placebo not mentioned",
+                        (
+                            "Placebo"
+                            if has_placebo_info
+                            else "No placebo or placebo not mentioned"
+                        ),
                     ],
                     answer=placebo_result,
                     result=placebo_result,

diff --git a/ice/recipes/placebo_keyword_baseline.py b/ice/recipes/placebo_keyword_baseline.py
@@ -54,9 +54,11 @@ async def run(self, paper: Paper):
                     result=f"{placebo_answer.classification}: {placebo_answer.sentence}",
                     answer=f"{placebo_answer.classification}: {placebo_answer.sentence}",
                     classifications=[
-                        "Placebo"
-                        if placebo_answer.classification == "Placebo"
-                        else "No placebo or placebo not mentioned",
+                        (
+                            "Placebo"
+                            if placebo_answer.classification == "Placebo"
+                            else "No placebo or placebo not mentioned"
+                        ),
                         placebo_answer.classification,
                     ],
                     excerpts=[placebo_answer.sentence],

diff --git a/ice/recipes/primer/answer_by_dispatch/types.py b/ice/recipes/primer/answer_by_dispatch/types.py
@@ -7,8 +7,7 @@
 
 
 class QuestionRecipe(Protocol):
-    async def __call__(self, question: str) -> str:
-        ...
+    async def __call__(self, question: str) -> str: ...
 
 
 @dataclass