Allows use of pydantic 2 by dependents.

Some applications want to use pydantic 2 but have been blocked by the strict constraint on pydantic 1 in mmda. This changeset allows the basic library to be used with either major version (but preserves the 1.x requirement for specific models so as not to break anything in S2's SPP pipeline).
allenai · Jun 5, 2024 · 79fea27 · 79fea27
1 parent a39556d
commit 79fea27
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 19 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = 'mmda'
-version = '0.9.17'
+version = '0.9.18'
 description = 'MMDA - multimodal document analysis'
 authors = [
     {name = 'Allen Institute for Artificial Intelligence', email = '[email protected]'},
@@ -14,7 +14,7 @@ dependencies = [
         'pdfplumber==0.7.4',
         'requests',
         'pandas<2',
-        'pydantic<2',
+        'pydantic[settings]>=1,<3',
         'ncls==0.0.66',
         'necessary>=0.3.2',
 ]
@@ -73,13 +73,15 @@ pysbd_predictors = [
     'pysbd',
 ]
 heuristic_predictors = [
-    'tokenizers'
+    'tokenizers',
+    'pydantic>=1,<2',
 ]
 lp_predictors = [
     'layoutparser',
     'torch',
     'torchvision',
     'effdet',
+    'pydantic>=1,<2'
 ]
 hf_predictors = [
     'torch',
@@ -89,38 +91,45 @@ hf_predictors = [
 vila_predictors = [
     'vila>=0.5,<0.6',
     'transformers<4.34.0',
+    'pydantic>=1,<2',
 ]
 mention_predictor = [
     'transformers[torch]',
-    'optimum[onnxruntime]'
+    'optimum[onnxruntime]',
+    'pydantic>=1,<2',
 ]
 mention_predictor_gpu = [
     'transformers[torch]',
     'optimum[onnxruntime-gpu]',
+    'pydantic>=1,<2',
 ]
 bibentry_predictor = [
     'transformers',
     'unidecode',
     'torch',
     'optimum[onnxruntime]',
+    'pydantic>=1,<2',
 ]
 bibentry_predictor_gpu = [
     'transformers',
     'unidecode',
     'torch',
     'optimum[onnxruntime-gpu]',
+    'pydantic>=1,<2'
 ]
 bibentry_detection_predictor = [
     'Pillow<10',
     'layoutparser',
     'torch==1.8.0+cu111',
     'torchvision==0.9.0+cu111',
+    'pydantic>=1,<2',
 ]
 citation_links = [
     'numpy',
     'thefuzz[speedup]',
     'scikit-learn',
     'xgboost',
+    'pydantic>=1,<2',
 ]
 section_nesting = [
     'numpy',
@@ -129,12 +138,14 @@ section_nesting = [
 ]
 figure_table_predictors = [
     'scipy',
+    'pydantic>=1,<2',
 ]
 svm_word_predictor = [
     'scikit-learn',
     'scipy',
     'numpy',
-    'tokenizers'
+    'tokenizers',
+    'pydantic>=1,<2',
 ]
 recipes = [
     'layoutparser',

diff --git a/src/ai2_internal/api.py b/src/ai2_internal/api.py
@@ -1,10 +1,10 @@
-from typing import List, Optional, Type
+from typing import Any, List, Optional, Type
 
 from pydantic import BaseModel, Extra, Field
-from pydantic.fields import ModelField
 
 import mmda.types.annotation as mmda_ann
 
+
 __all__ = ["BoxGroup", "SpanGroup"]
 
 
@@ -34,7 +34,7 @@ def to_mmda(self) -> mmda_ann.Box:
 class Span(BaseModel):
     start: int
     end: int
-    box: Optional[Box]
+    box: Optional[Box] = None
 
     @classmethod
     def from_mmda(cls, span: mmda_ann.Span) -> "Span":
@@ -72,14 +72,20 @@ class Annotation(BaseModel, extra=Extra.ignore):
 
     @classmethod
     def get_metadata_cls(cls) -> Type[Attributes]:
-        attrs_field: ModelField = cls.__fields__["attributes"]
+        attrs_field = cls.__fields__["attributes"]
+
+        # pydantic v2
+        if hasattr(attrs_field, "annotation"):
+            return attrs_field.annotation
+
+        # pydantic v1
         return attrs_field.type_
 
 
 class BoxGroup(Annotation):
     boxes: List[Box]
-    id: Optional[int]
-    type: Optional[str]
+    id: Optional[int] = None
+    type: Optional[str] = None
 
     @classmethod
     def from_mmda(cls, box_group: mmda_ann.BoxGroup) -> "BoxGroup":
@@ -109,10 +115,10 @@ def to_mmda(self) -> mmda_ann.BoxGroup:
 
 class SpanGroup(Annotation):
     spans: List[Span]
-    box_group: Optional[BoxGroup]
-    id: Optional[int]
-    type: Optional[str]
-    text: Optional[str]
+    box_group: Optional[BoxGroup] = None
+    id: Optional[int] = None
+    type: Optional[str] = None
+    text: Optional[str] = None
 
     @classmethod
     def from_mmda(cls, span_group: mmda_ann.SpanGroup) -> "SpanGroup":

diff --git a/src/mmda/featurizers/citation_link_featurizers.py b/src/mmda/featurizers/citation_link_featurizers.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pandas as pd
-from pydantic import BaseModel
 import re
 from thefuzz import fuzz
 from typing import List, Tuple, Dict
@@ -49,7 +48,7 @@ def featurize(possible_links: List[CitationLink]) -> pd.DataFrame:
     df[JACCARD_ALPHA] = df.apply(lambda row: jaccard_alpha(row['source_text'], row['target_text']), axis=1)
     df[MATCH_FIRST_TOKEN] = df.apply(lambda row: match_first_token(row['source_text'], row['target_text']), axis=1)
     df[FIRST_POSITION] = df.apply(lambda row: first_position(row['source_text'], row['target_text']), axis=1)
-    
+
     # drop text columns
     X_features = df.drop(columns=['source_text', 'target_text'])
     return X_features
@@ -106,7 +105,7 @@ def match_numeric(source: str, target: str) -> float:
     for number in source_numerics:
         found = number in target_numerics
         token_found.append(found)
-    
+
     if False not in token_found:
         return 1
     else:
@@ -149,7 +148,7 @@ def match_source_tokens(source: str, target: str) -> float:
             if token != 'et' and token != 'al' and token != 'and':
                 found = token in target_tokens
                 token_found.append(found)
-        
+
         if False not in token_found:
             return 1
         else: