diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b4a84f16..1b7232bb 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -31,6 +31,13 @@ jobs: - name: Lint run: | flake8 medcat + - name: Pydantic 1 check + # NOTE: the following will look for use of pydantic1-specific .dict() method and .__fields__ attribute + # if there are some (that are not annotated for pydantic1 backwards compatibility) a non-zero exit + # code is returned, which will hald the workflow and print out the offending parts + run: | + grep "\.__fields__" medcat -rI | grep -v "# 4pydantic1 - backwards compatibility" | tee /dev/stderr | test $(wc -l) -eq 0 + grep "\.dict(" medcat -rI | grep -v "# 4pydantic1 - backwards compatibility" | tee /dev/stderr | test $(wc -l) -eq 0 - name: Test run: | all_files=$(git ls-files | grep '^tests/.*\.py$' | grep -v '/__init__\.py$' | sed 's/\.py$//' | sed 's/\//./g') @@ -54,7 +61,6 @@ jobs: repo: context.repo.repo }); core.setOutput('latest_version', latestRelease.data.tag_name); - - name: Make sure there's no deprecated methods that should be removed. # only run this for master -> production PR. I.e just before doing a release. if: github.event.pull_request.base.ref == 'main' && github.event.pull_request.head.ref == 'production' diff --git a/install_requires.txt b/install_requires.txt index 77b61082..136728d8 100644 --- a/install_requires.txt +++ b/install_requires.txt @@ -19,6 +19,6 @@ 'xxhash>=3.0.0' # allow later versions, tested with 3.1.0 'blis>=0.7.5,<1.0.0' # allow later versions, tested with 0.7.9, avoid 1.0.0 (depends on numpy 2) 'click>=8.0.4' # allow later versions, tested with 8.1.3 -'pydantic>=1.10.0,<2.0' # for spacy compatibility; avoid 2.0 due to breaking changes +'pydantic>=2.0.0,<3.0' # avoid next major release "humanfriendly~=10.0" # for human readable file / RAM sizes "peft>=0.8.2" diff --git a/medcat/cat.py b/medcat/cat.py index 707dbd7f..13042acd 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -590,7 +590,7 @@ def _print_stats(self, def _init_ckpts(self, is_resumed, checkpoint): if self.config.general.checkpoint.steps is not None or checkpoint is not None: - checkpoint_config = CheckpointConfig(**self.config.general.checkpoint.dict()) + checkpoint_config = CheckpointConfig(**self.config.general.checkpoint.model_dump()) checkpoint_manager = CheckpointManager('cat_train', checkpoint_config) if is_resumed: # TODO: probably remove is_resumed mark and always resume if a checkpoint is provided, diff --git a/medcat/config.py b/medcat/config.py index a1dd15e7..fb3ab5f0 100644 --- a/medcat/config.py +++ b/medcat/config.py @@ -1,6 +1,5 @@ from datetime import datetime -from pydantic import BaseModel, Extra, ValidationError -from pydantic.fields import ModelField +from pydantic import BaseModel, ValidationError from typing import List, Set, Tuple, cast, Any, Callable, Dict, Optional, Union, Type, Literal from multiprocessing import cpu_count import logging @@ -125,7 +124,7 @@ def merge_config(self, config_dict: Dict) -> None: attr = None # new attribute value = config_dict[key] if isinstance(value, BaseModel): - value = value.dict() + value = value.model_dump() if isinstance(attr, MixingConfig): attr.merge_config(value) else: @@ -177,7 +176,7 @@ def rebuild_re(self) -> None: def _calc_hash(self, hasher: Optional[Hasher] = None) -> Hasher: if hasher is None: hasher = Hasher() - for _, v in cast(BaseModel, self).dict().items(): + for _, v in cast(BaseModel, self).model_dump().items(): if isinstance(v, MixingConfig): v._calc_hash(hasher) else: @@ -189,7 +188,7 @@ def get_hash(self, hasher: Optional[Hasher] = None): return hasher.hexdigest() def __str__(self) -> str: - return str(cast(BaseModel, self).dict()) + return str(cast(BaseModel, self).model_dump()) @classmethod def load(cls, save_path: str) -> "MixingConfig": @@ -238,15 +237,15 @@ def asdict(self) -> Dict[str, Any]: Returns: Dict[str, Any]: The dictionary associated with this config """ - return cast(BaseModel, self).dict() + return cast(BaseModel, self).model_dump() - def fields(self) -> Dict[str, ModelField]: + def fields(self) -> dict: """Get the fields associated with this config. Returns: - Dict[str, ModelField]: The dictionary of the field names and fields + dict: The dictionary of the field names and fields """ - return cast(BaseModel, self).__fields__ + return cast(BaseModel, self).model_fields class VersionInfo(MixingConfig, BaseModel): @@ -272,7 +271,7 @@ class VersionInfo(MixingConfig, BaseModel): """Which version of medcat was used to build the CDB""" class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -290,7 +289,7 @@ class CDBMaker(MixingConfig, BaseModel): """Minimum number of letters required in a name to be accepted for a concept""" class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -303,7 +302,7 @@ class AnnotationOutput(MixingConfig, BaseModel): include_text_in_output: bool = False class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -317,7 +316,7 @@ class CheckPoint(MixingConfig, BaseModel): """When training the maximum checkpoints will be kept on the disk""" class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -354,7 +353,7 @@ class General(MixingConfig, BaseModel): NB! For these changes to take effect, the pipe would need to be recreated.""" checkpoint: CheckPoint = CheckPoint() - usage_monitor = UsageMonitor() + usage_monitor: UsageMonitor = UsageMonitor() """Checkpointing config""" log_level: int = logging.INFO """Logging config for everything | 'tagger' can be disabled, but will cause a drop in performance""" @@ -395,7 +394,7 @@ class General(MixingConfig, BaseModel): reliable due to not taking into account all the details of the changes.""" class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -424,7 +423,7 @@ class Preprocessing(MixingConfig, BaseModel): NB! For these changes to take effect, the pipe would need to be recreated.""" class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -444,7 +443,7 @@ class Ner(MixingConfig, BaseModel): """Try reverse word order for short concepts (2 words max), e.g. heart disease -> disease heart""" class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -579,7 +578,7 @@ class Linking(MixingConfig, BaseModel): """If true when the context of a concept is calculated (embedding) the words making that concept are not taken into account""" class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -600,7 +599,7 @@ class Config: # this if for word_skipper and punct_checker which would otherwise # not have a validator arbitrary_types_allowed = True - extra = Extra.allow + extra = 'allow' validate_assignment = True def __init__(self, *args, **kwargs): @@ -618,7 +617,7 @@ def rebuild_re(self) -> None: # Override def get_hash(self): hasher = Hasher() - for k, v in self.dict().items(): + for k, v in self.model_dump().items(): if k in ['hash', ]: # ignore hash continue @@ -674,4 +673,6 @@ def wrapper(*args, **kwargs): # we get a nicer exceptio _waf_advice = "You can use `cat.cdb.weighted_average_function` to access it directly" Linking.__getattribute__ = _wrapper(Linking.__getattribute__, Linking, _waf_advice, AttributeError) # type: ignore +if hasattr(Linking, '__getattr__'): + Linking.__getattr__ = _wrapper(Linking.__getattr__, Linking, _waf_advice, AttributeError) # type: ignore Linking.__getitem__ = _wrapper(Linking.__getitem__, Linking, _waf_advice, KeyError) # type: ignore diff --git a/medcat/config_meta_cat.py b/medcat/config_meta_cat.py index ef8f908f..0d6eb7a6 100644 --- a/medcat/config_meta_cat.py +++ b/medcat/config_meta_cat.py @@ -1,5 +1,5 @@ from typing import Dict, Any -from medcat.config import MixingConfig, BaseModel, Optional, Extra +from medcat.config import MixingConfig, BaseModel, Optional class General(MixingConfig, BaseModel): @@ -65,7 +65,7 @@ class General(MixingConfig, BaseModel): Otherwise defaults to doc._.ents or doc.ents per the annotate_overlapping settings""" class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -169,7 +169,7 @@ class Model(MixingConfig, BaseModel): """If set to True center positions will be ignored when calculating representation""" class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -191,7 +191,7 @@ class Train(MixingConfig, BaseModel): """If set only this CUIs will be used for training""" auto_save_model: bool = True """Should do model be saved during training for best results""" - last_train_on: Optional[int] = None + last_train_on: Optional[float] = None """When was the last training run""" metric: Dict[str, str] = {'base': 'weighted avg', 'score': 'f1-score'} """What metric should be used for choosing the best model""" @@ -206,7 +206,7 @@ class Train(MixingConfig, BaseModel): """Focal Loss hyperparameter - determines importance the loss gives to hard-to-classify examples""" class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -217,5 +217,5 @@ class ConfigMetaCAT(MixingConfig, BaseModel): train: Train = Train() class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True diff --git a/medcat/config_rel_cat.py b/medcat/config_rel_cat.py index dfa3b009..c16735d6 100644 --- a/medcat/config_rel_cat.py +++ b/medcat/config_rel_cat.py @@ -1,6 +1,6 @@ import logging from typing import Dict, Any, List -from medcat.config import MixingConfig, BaseModel, Optional, Extra +from medcat.config import MixingConfig, BaseModel, Optional class General(MixingConfig, BaseModel): @@ -89,7 +89,7 @@ class Model(MixingConfig, BaseModel): """If set to True center positions will be ignored when calculating representation""" class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -116,7 +116,7 @@ class Train(MixingConfig, BaseModel): """Should the model be saved during training for best results""" class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -127,5 +127,5 @@ class ConfigRelCAT(MixingConfig, BaseModel): train: Train = Train() class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True diff --git a/medcat/config_transformers_ner.py b/medcat/config_transformers_ner.py index 9f3102ac..e9661aaf 100644 --- a/medcat/config_transformers_ner.py +++ b/medcat/config_transformers_ner.py @@ -1,4 +1,4 @@ -from medcat.config import MixingConfig, BaseModel, Optional, Extra +from medcat.config import MixingConfig, BaseModel, Optional class General(MixingConfig, BaseModel): @@ -16,11 +16,11 @@ class General(MixingConfig, BaseModel): chunking_overlap_window: Optional[int] = 5 """Size of the overlap window used for chunking""" test_size: float = 0.2 - last_train_on: Optional[int] = None + last_train_on: Optional[float] = None verbose_metrics: bool = False class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True @@ -29,5 +29,5 @@ class ConfigTransformersNER(MixingConfig, BaseModel): general: General = General() class Config: - extra = Extra.allow + extra = 'allow' validate_assignment = True diff --git a/medcat/meta_cat.py b/medcat/meta_cat.py index 386bbe0c..9182fe00 100644 --- a/medcat/meta_cat.py +++ b/medcat/meta_cat.py @@ -114,8 +114,8 @@ def get_hash(self) -> str: """ hasher = Hasher() # Set last_train_on if None - if self.config.train['last_train_on'] is None: - self.config.train['last_train_on'] = datetime.now().timestamp() + if self.config.train.last_train_on is None: + self.config.train.last_train_on = datetime.now().timestamp() hasher.update(self.config.get_hash()) return hasher.hexdigest() @@ -310,7 +310,7 @@ def train_raw(self, data_loaded: Dict, save_dir_path: Optional[str] = None, data # Save everything now self.save(save_dir_path=save_dir_path) - self.config.train['last_train_on'] = datetime.now().timestamp() + self.config.train.last_train_on = datetime.now().timestamp() return report def eval(self, json_path: str) -> Dict: diff --git a/medcat/ner/transformers_ner.py b/medcat/ner/transformers_ner.py index 1de8d6d8..392b4a94 100644 --- a/medcat/ner/transformers_ner.py +++ b/medcat/ner/transformers_ner.py @@ -103,8 +103,8 @@ def get_hash(self) -> str: """ hasher = Hasher() # Set last_train_on if None - if self.config.general['last_train_on'] is None: - self.config.general['last_train_on'] = datetime.now().timestamp() + if self.config.general.last_train_on is None: + self.config.general.last_train_on = datetime.now().timestamp() hasher.update(self.config.get_hash()) return hasher.hexdigest() @@ -242,7 +242,7 @@ def train(self, trainer.train() # type: ignore # Save the training time - self.config.general['last_train_on'] = datetime.now().timestamp() # type: ignore + self.config.general.last_train_on = datetime.now().timestamp() # type: ignore # Save everything self.save(save_dir_path=os.path.join(self.training_arguments.output_dir, 'final_model')) diff --git a/medcat/utils/regression/checking.py b/medcat/utils/regression/checking.py index 2c2d52ce..f470fc2c 100644 --- a/medcat/utils/regression/checking.py +++ b/medcat/utils/regression/checking.py @@ -411,7 +411,7 @@ def to_dict(self) -> dict: d = {} for case in self.cases: d[case.name] = case.to_dict() - d['meta'] = self.metadata.dict() + d['meta'] = self.metadata.model_dump() fix_np_float64(d['meta']) return d diff --git a/medcat/utils/regression/regression_checker.py b/medcat/utils/regression/regression_checker.py index 4906e9db..d494a40d 100644 --- a/medcat/utils/regression/regression_checker.py +++ b/medcat/utils/regression/regression_checker.py @@ -118,8 +118,8 @@ def main(model_pack_dir: Path, test_suite_file: Path, examples_strictness = Strictness[examples_strictness_str] if jsonpath: logger.info('Writing to %s', str(jsonpath)) - jsonpath.write_text(json.dumps(res.dict(strictness=examples_strictness), - indent=jsonindent)) + dumped = res.model_dump(strictness=examples_strictness) + jsonpath.write_text(json.dumps(dumped, indent=jsonindent)) else: logger.info(res.get_report(phrases_separately=phrases, hide_empty=hide_empty, examples_strictness=examples_strictness, diff --git a/medcat/utils/regression/results.py b/medcat/utils/regression/results.py index 2667a970..1a6bf893 100644 --- a/medcat/utils/regression/results.py +++ b/medcat/utils/regression/results.py @@ -1,5 +1,5 @@ from enum import Enum, auto -from typing import Dict, List, Optional, Any, Set, Iterable, Tuple +from typing import Dict, List, Optional, Any, Set, Iterable, Tuple, cast import json import pydantic @@ -372,7 +372,7 @@ def get_report(self) -> str: ]) return "\n".join(ret_vals) - def dict(self, **kwargs) -> dict: + def model_dump(self, **kwargs) -> dict: if 'strictness' in kwargs: kwargs = kwargs.copy() # so if used elsewhere, keeps the kwarg strict_raw = kwargs.pop('strictness') @@ -395,17 +395,17 @@ def dict(self, **kwargs) -> dict: key.name: value for key, value in self.findings.items() } serialized_examples = [ - (ft.dict(**kwargs), (f[0].name, f[1])) for ft, f in self.examples + (ft.model_dump(**kwargs), (f[0].name, f[1])) for ft, f in self.examples # only count if NOT in strictness matrix (i.e 'failures') if f[0] not in STRICTNESS_MATRIX[strictness] ] - model_dict = super().dict(**kwargs) + model_dict = cast(pydantic.BaseModel, super()).model_dump(**kwargs) model_dict['findings'] = serialized_dict model_dict['examples'] = serialized_examples return model_dict def json(self, **kwargs) -> str: - d = self.dict(**kwargs) + d = self.model_dump(**kwargs) return json.dumps(d) @@ -478,7 +478,7 @@ def get_report(self, phrases_separately: bool = False) -> str: for srd in self.per_phrase_results.values()]) return sr + '\n\t\t' + children.replace('\n', '\n\t\t') - def dict(self, **kwargs) -> dict: + def model_dump(self, **kwargs) -> dict: if 'exclude' in kwargs and kwargs['exclude'] is not None: exclude: set = kwargs['exclude'] else: @@ -486,7 +486,7 @@ def dict(self, **kwargs) -> dict: kwargs['exclude'] = exclude # NOTE: ignoring here so that examples are only present in the per phrase part exclude.update(('examples', 'per_phrase_results')) - d = super().dict(**kwargs) + d = cast(pydantic.BaseModel, super()).model_dump(**kwargs) if 'examples' in d: # NOTE: I don't really know why, but the examples still # seem to be a part of the resulting dict, so I need @@ -495,7 +495,7 @@ def dict(self, **kwargs) -> dict: # NOTE: need to propagate here manually so the strictness keyword # makes sense and doesn't cause issues due being to unexpected keyword per_phrase_results = { - phrase: res.dict(**kwargs) for phrase, res in + phrase: res.model_dump(**kwargs) for phrase, res in sorted(self.per_phrase_results.items(), key=lambda it: it[0]) } d['per_phrase_results'] = per_phrase_results @@ -677,7 +677,7 @@ def get_report(self, phrases_separately: bool, ]) return "\n".join(ret_vals) + f"\n{delegated}" - def dict(self, **kwargs) -> dict: + def model_dump(self, **kwargs) -> dict: if 'strictness' in kwargs: strict_raw = kwargs.pop('strictness') if isinstance(strict_raw, Strictness): @@ -688,8 +688,8 @@ def dict(self, **kwargs) -> dict: raise ValueError(f"Unknown stircntess specified: {strict_raw}") else: strictness = Strictness.NORMAL - out_dict = super().dict(exclude={'parts'}, **kwargs) - out_dict['parts'] = [part.dict(strictness=strictness) for part in self.parts] + out_dict = cast(pydantic.BaseModel, super()).model_dump(exclude={'parts'}, **kwargs) + out_dict['parts'] = [part.model_dump(strictness=strictness) for part in self.parts] return out_dict diff --git a/tests/stats/helpers.py b/tests/stats/helpers.py index 80771b11..af58517b 100644 --- a/tests/stats/helpers.py +++ b/tests/stats/helpers.py @@ -1,9 +1,13 @@ -from pydantic import create_model_from_typeddict +import pydantic +from unittest import TestCase + +import pydantic.error_wrappers from medcat.stats.mctexport import MedCATTrainerExport -MCTExportPydanticModel = create_model_from_typeddict(MedCATTrainerExport) +MCTExportPydanticModel = pydantic.TypeAdapter(MedCATTrainerExport) + def nullify_doc_names_proj_ids(export: MedCATTrainerExport) -> MedCATTrainerExport: @@ -15,3 +19,8 @@ def nullify_doc_names_proj_ids(export: MedCATTrainerExport) -> MedCATTrainerExpo ], key=lambda doc: doc['id']) } for project in export['projects'] ]} + + +def assert_is_mct_export(tc: TestCase, mct_export: dict): + model_instance = MCTExportPydanticModel.validate_python(mct_export) + tc.assertIsInstance(model_instance, dict) # NOTE: otherwise would have raised an exception diff --git a/tests/stats/test_kfold.py b/tests/stats/test_kfold.py index cae44248..d06b666e 100644 --- a/tests/stats/test_kfold.py +++ b/tests/stats/test_kfold.py @@ -5,11 +5,10 @@ from medcat.stats import kfold from medcat.cat import CAT -from pydantic.error_wrappers import ValidationError as PydanticValidationError import unittest -from .helpers import MCTExportPydanticModel, nullify_doc_names_proj_ids +from .helpers import assert_is_mct_export, nullify_doc_names_proj_ids class MCTExportTests(unittest.TestCase): @@ -22,11 +21,7 @@ def setUpClass(cls) -> None: cls.mct_export = json.load(f) def assertIsMCTExport(self, obj): - try: - model = MCTExportPydanticModel(**obj) - except PydanticValidationError as e: - raise AssertionError("Not n MCT export") from e - self.assertIsInstance(model, MCTExportPydanticModel) + assert_is_mct_export(self, obj) class KFoldCreatorTests(MCTExportTests): diff --git a/tests/stats/test_mctexport.py b/tests/stats/test_mctexport.py index 8ef11f55..924bbebf 100644 --- a/tests/stats/test_mctexport.py +++ b/tests/stats/test_mctexport.py @@ -5,7 +5,7 @@ import unittest -from .helpers import MCTExportPydanticModel +from .helpers import assert_is_mct_export class MCTExportIterationTests(unittest.TestCase): @@ -22,8 +22,7 @@ def setUpClass(cls) -> None: def test_conforms_to_template(self): # NOTE: This uses pydantic to make sure that the MedCATTrainerExport # type matches the actual export format - model_instance = MCTExportPydanticModel(**self.mct_export) - self.assertIsInstance(model_instance, MCTExportPydanticModel) + assert_is_mct_export(self, self.mct_export) def test_iterates_over_all_docs(self): self.assertEqual(mctexport.count_all_docs(self.mct_export), self.EXPECTED_DOCS) diff --git a/tests/utils/regression/test_checking.py b/tests/utils/regression/test_checking.py index 06ca933d..9cfffeeb 100644 --- a/tests/utils/regression/test_checking.py +++ b/tests/utils/regression/test_checking.py @@ -239,7 +239,7 @@ def setUpClass(cls) -> None: final_phrase='FINAL PHRASE'), finding=(Finding.FOUND_OTHER, 'CUI=OTHER')) def test_result_is_json_serialisable(self): - rd = self.res.dict() + rd = self.res.model_dump() s = json.dumps(rd) self.assertIsInstance(s, str) @@ -249,19 +249,19 @@ def test_result_is_json_serialisable_pydantic(self): def test_can_use_strictness(self): e1 = [ - example for part in self.res.dict(strictness=Strictness.STRICTEST)['parts'] + example for part in self.res.model_dump(strictness=Strictness.STRICTEST)['parts'] for per_phrase in part['per_phrase_results'].values() for example in per_phrase['examples'] ] e2 = [ - example for part in self.res.dict(strictness=Strictness.LENIENT)['parts'] + example for part in self.res.model_dump(strictness=Strictness.LENIENT)['parts'] for per_phrase in part['per_phrase_results'].values() for example in per_phrase['examples'] ] self.assertGreater(len(e1), len(e2)) def test_dict_includes_all_parts(self): - d_parts = self.res.dict()['parts'] + d_parts = self.res.model_dump()['parts'] self.assertEqual(len(self.res.parts), len(d_parts)) diff --git a/tests/utils/regression/test_results.py b/tests/utils/regression/test_results.py index d8e27054..7033e286 100644 --- a/tests/utils/regression/test_results.py +++ b/tests/utils/regression/test_results.py @@ -291,13 +291,13 @@ def test_can_json_dump_pydantic(self): self.assertIsInstance(s, str) def test_can_json_dump_json(self): - s = json.dumps(self.rd.dict()) + s = json.dumps(self.rd.model_dump()) self.assertIsInstance(s, str) def test_can_use_strictness_for_dump(self): - d_strictest = self.rd.dict(strictness='STRICTEST') + d_strictest = self.rd.model_dump(strictness='STRICTEST') e_strictest = d_strictest['examples'] # this should have more examples - d_lenient = self.rd.dict(strictness='NORMAL') + d_lenient = self.rd.model_dump(strictness='NORMAL') e_normal = d_lenient['examples'] self.assertGreater(len(e_strictest), len(e_normal))