allenai · cmwilhelm · Jun 5, 2024 · Jun 5, 2024 · Jun 5, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = 'mmda'
-version = '0.9.17'
+version = '0.9.18'
 description = 'MMDA - multimodal document analysis'
 authors = [
     {name = 'Allen Institute for Artificial Intelligence', email = '[email protected]'},
@@ -14,7 +14,7 @@ dependencies = [
         'pdfplumber==0.7.4',
         'requests',
         'pandas<2',
-        'pydantic<2',
+        'pydantic[settings]>=1,<3',
         'ncls==0.0.66',
         'necessary>=0.3.2',
 ]
@@ -73,13 +73,15 @@ pysbd_predictors = [
     'pysbd',
 ]
 heuristic_predictors = [
-    'tokenizers'
+    'tokenizers',
+    'pydantic>=1,<2',
 ]
 lp_predictors = [
     'layoutparser',
     'torch',
     'torchvision',
     'effdet',
+    'pydantic>=1,<2'
 ]
 hf_predictors = [
     'torch',
@@ -89,38 +91,45 @@ hf_predictors = [
 vila_predictors = [
     'vila>=0.5,<0.6',
     'transformers<4.34.0',
+    'pydantic>=1,<2',
 ]
 mention_predictor = [
     'transformers[torch]',
-    'optimum[onnxruntime]'
+    'optimum[onnxruntime]',
+    'pydantic>=1,<2',
 ]
 mention_predictor_gpu = [
     'transformers[torch]',
     'optimum[onnxruntime-gpu]',
+    'pydantic>=1,<2',
 ]
 bibentry_predictor = [
     'transformers',
     'unidecode',
     'torch',
     'optimum[onnxruntime]',
+    'pydantic>=1,<2',
 ]
 bibentry_predictor_gpu = [
     'transformers',
     'unidecode',
     'torch',
     'optimum[onnxruntime-gpu]',
+    'pydantic>=1,<2'
 ]
 bibentry_detection_predictor = [
     'Pillow<10',
     'layoutparser',
     'torch==1.8.0+cu111',
     'torchvision==0.9.0+cu111',
+    'pydantic>=1,<2',
 ]
 citation_links = [
     'numpy',
     'thefuzz[speedup]',
     'scikit-learn',
     'xgboost',
+    'pydantic>=1,<2',
 ]
 section_nesting = [
     'numpy',
@@ -129,12 +138,14 @@ section_nesting = [
 ]
 figure_table_predictors = [
     'scipy',
+    'pydantic>=1,<2',
 ]
 svm_word_predictor = [
     'scikit-learn',
     'scipy',
     'numpy',
-    'tokenizers'
+    'tokenizers',
+    'pydantic>=1,<2',
 ]
 recipes = [
     'layoutparser',

diff --git a/src/ai2_internal/api.py b/src/ai2_internal/api.py
@@ -1,10 +1,10 @@
-from typing import List, Optional, Type
+from typing import Any, List, Optional, Type
 
 from pydantic import BaseModel, Extra, Field
-from pydantic.fields import ModelField
 
 import mmda.types.annotation as mmda_ann
 
+
 __all__ = ["BoxGroup", "SpanGroup"]
 
 
@@ -34,7 +34,7 @@ def to_mmda(self) -> mmda_ann.Box:
 class Span(BaseModel):
     start: int
     end: int
-    box: Optional[Box]
+    box: Optional[Box] = None
 
     @classmethod
     def from_mmda(cls, span: mmda_ann.Span) -> "Span":
@@ -72,14 +72,20 @@ class Annotation(BaseModel, extra=Extra.ignore):
 
     @classmethod
     def get_metadata_cls(cls) -> Type[Attributes]:
-        attrs_field: ModelField = cls.__fields__["attributes"]
+        attrs_field = cls.__fields__["attributes"]
+
+        # pydantic v2
+        if hasattr(attrs_field, "annotation"):
+            return attrs_field.annotation
+
+        # pydantic v1
         return attrs_field.type_
 
 
 class BoxGroup(Annotation):
     boxes: List[Box]
-    id: Optional[int]
-    type: Optional[str]
+    id: Optional[int] = None
+    type: Optional[str] = None
 
     @classmethod
     def from_mmda(cls, box_group: mmda_ann.BoxGroup) -> "BoxGroup":
@@ -109,10 +115,10 @@ def to_mmda(self) -> mmda_ann.BoxGroup:
 
 class SpanGroup(Annotation):
     spans: List[Span]
-    box_group: Optional[BoxGroup]
-    id: Optional[int]
-    type: Optional[str]
-    text: Optional[str]
+    box_group: Optional[BoxGroup] = None
+    id: Optional[int] = None
+    type: Optional[str] = None
+    text: Optional[str] = None
 
     @classmethod
     def from_mmda(cls, span_group: mmda_ann.SpanGroup) -> "SpanGroup":

diff --git a/src/mmda/featurizers/citation_link_featurizers.py b/src/mmda/featurizers/citation_link_featurizers.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pandas as pd
-from pydantic import BaseModel
 import re
 from thefuzz import fuzz
 from typing import List, Tuple, Dict
@@ -49,7 +48,7 @@ def featurize(possible_links: List[CitationLink]) -> pd.DataFrame:
     df[JACCARD_ALPHA] = df.apply(lambda row: jaccard_alpha(row['source_text'], row['target_text']), axis=1)
     df[MATCH_FIRST_TOKEN] = df.apply(lambda row: match_first_token(row['source_text'], row['target_text']), axis=1)
     df[FIRST_POSITION] = df.apply(lambda row: first_position(row['source_text'], row['target_text']), axis=1)
-    
+
     # drop text columns
     X_features = df.drop(columns=['source_text', 'target_text'])
     return X_features
@@ -106,7 +105,7 @@ def match_numeric(source: str, target: str) -> float:
     for number in source_numerics:
         found = number in target_numerics
         token_found.append(found)
-    
+
     if False not in token_found:
         return 1
     else:
@@ -149,7 +148,7 @@ def match_source_tokens(source: str, target: str) -> float:
             if token != 'et' and token != 'al' and token != 'and':
                 found = token in target_tokens
                 token_found.append(found)
-        
+
         if False not in token_found:
             return 1
         else: