From 8d9a9497386ce39cf5136b3765d05a5d854e16b4 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Fri, 29 Sep 2023 13:43:03 +0100 Subject: [PATCH 1/4] Truncate the fields in the Publication model if longer than expected. I've increased the max len of the pub_type in the Publication model. To prevent the import to fail in the future I've implemented a brute truncation mechanism when a Publication is saved. --- emgapi/mgx.py | 0 emgapi/models.py | 13 ++- .../management/commands/import_publication.py | 35 ++++---- .../europe_pmc_api/europe_pmc_api_handler.py | 4 +- tests/webuploader/test_import_publication.py | 89 +++++++++++++------ 5 files changed, 93 insertions(+), 48 deletions(-) create mode 100644 emgapi/mgx.py diff --git a/emgapi/mgx.py b/emgapi/mgx.py new file mode 100644 index 000000000..e69de29bb diff --git a/emgapi/models.py b/emgapi/models.py index 1faf824ea..1cf7f0321 100644 --- a/emgapi/models.py +++ b/emgapi/models.py @@ -502,7 +502,7 @@ class Publication(models.Model): db_column='PUBLISHED_YEAR', blank=True, null=True, help_text='Published year') pub_type = models.CharField( - db_column='PUB_TYPE', max_length=150, blank=True, null=True) + db_column='PUB_TYPE', max_length=300, blank=True, null=True) objects = PublicationManager() @@ -510,6 +510,17 @@ class Meta: db_table = 'PUBLICATION' ordering = ('pubmed_id',) + def save(self, *args, **kwargs): + for field in self._meta.fields: + if isinstance(field, models.TextField) or isinstance(field, models.CharField): + field_name = field.name + max_length = field.max_length + field_value = getattr(self, field_name) + if field_value and len(field_value) > max_length: + logger.error(f"Publication field {field_name} content was truncated at {max_length}") + setattr(self, field_name, field_value[:max_length]) + super(Publication, self).save(*args, **kwargs) + def __str__(self): return str(self.pubmed_id) diff --git a/emgapianns/management/commands/import_publication.py b/emgapianns/management/commands/import_publication.py index 3bc8d52c0..55d064d39 100644 --- a/emgapianns/management/commands/import_publication.py +++ b/emgapianns/management/commands/import_publication.py @@ -16,7 +16,9 @@ import logging from django.core.management import BaseCommand from emgapi import models as emg_models -from emgapianns.management.lib.europe_pmc_api.europe_pmc_api_handler import EuropePMCApiHandler +from emgapianns.management.lib.europe_pmc_api.europe_pmc_api_handler import ( + EuropePMCApiHandler, +) logger = logging.getLogger(__name__) @@ -29,15 +31,17 @@ def lookup_publication_by_pubmed_id(pubmed_id): def update_or_create_publication(publication): return emg_models.Publication.objects.update_or_create( pubmed_id=publication.pmid, - defaults={'authors': publication.author_string, - 'doi': publication.doi, - 'isbn': publication.journal_issn, - 'iso_journal': publication.journal_title, - 'pub_title': publication.title, - 'raw_pages': publication.page_info, - 'volume': publication.journal_volume, - 'published_year': publication.pub_year, - 'pub_type': publication.pub_type}, + defaults={ + "authors": publication.author_string, + "doi": publication.doi, + "isbn": publication.journal_issn, + "iso_journal": publication.journal_title, + "pub_title": publication.title, + "raw_pages": publication.page_info, + "volume": publication.journal_volume, + "published_year": publication.pub_year, + "pub_type": publication.pub_type, + }, ) @@ -47,19 +51,18 @@ def lookup_publication_by_project_id(project_id): class Command(BaseCommand): - help = 'Creates or updates a publication in EMG.' + help = "Creates or updates a publication in EMG." def add_arguments(self, parser): # TODO: Consider lookup by project id - parser.add_argument('pubmed-id', - help='PubMed identifier (PMID)', - type=int, - action='store') + parser.add_argument( + "pubmed-id", help="PubMed identifier (PMID)", type=int, action="store" + ) def handle(self, *args, **options): logger.info("CLI %r" % options) - pubmed_id = options['pubmed-id'] + pubmed_id = options["pubmed-id"] publications = lookup_publication_by_pubmed_id(pubmed_id) for publication in publications: update_or_create_publication(publication) diff --git a/emgapianns/management/lib/europe_pmc_api/europe_pmc_api_handler.py b/emgapianns/management/lib/europe_pmc_api/europe_pmc_api_handler.py index 7c0a5b13e..cacfb308b 100644 --- a/emgapianns/management/lib/europe_pmc_api/europe_pmc_api_handler.py +++ b/emgapianns/management/lib/europe_pmc_api/europe_pmc_api_handler.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright 2019-2022 EMBL - European Bioinformatics Institute +# Copyright 2019-2023 EMBL - European Bioinformatics Institute # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ def get_default_connection_headers(): } -class Publication(object): +class Publication: def __init__( self, pub_year, diff --git a/tests/webuploader/test_import_publication.py b/tests/webuploader/test_import_publication.py index 29531aaf8..13bb9ff40 100644 --- a/tests/webuploader/test_import_publication.py +++ b/tests/webuploader/test_import_publication.py @@ -16,28 +16,39 @@ import pytest -from emgapianns.management.commands.import_publication import lookup_publication_by_pubmed_id - - -@pytest.mark.parametrize("pubmed_id, expected_pub_title, expected_year_of_pub, expected_authors, expected_doi", [ - (4838818, - "Proceedings: The morphological variation of nervous structures in the atrial endocardium of the dog.", - 1974, - "Floyd K, Linden RJ, Saunders DA.", - "n/a"), - (31138692, - "Mechanisms by which sialylated milk oligosaccharides impact bone biology in a gnotobiotic mouse " - "model of infant undernutrition.", - 2019, - "Cowardin CA, Ahern PP, Kung VL, Hibberd MC, Cheng J, Guruge JL, Sundaresan V, Head RD, Barile D," - " Mills DA, Barratt MJ, Huq S, Ahmed T, Gordon JI.", - "10.1073/pnas.1821770116") -]) -def test_lookup_publication_by_pubmed_id_should_return(pubmed_id, - expected_pub_title, - expected_year_of_pub, - expected_authors, - expected_doi): +from emgapi.models import Publication +from model_bakery import baker + + +from emgapianns.management.commands.import_publication import ( + lookup_publication_by_pubmed_id, +) + + +@pytest.mark.parametrize( + "pubmed_id, expected_pub_title, expected_year_of_pub, expected_authors, expected_doi", + [ + ( + 4838818, + "Proceedings: The morphological variation of nervous structures in the atrial endocardium of the dog.", + 1974, + "Floyd K, Linden RJ, Saunders DA.", + "n/a", + ), + ( + 31138692, + "Mechanisms by which sialylated milk oligosaccharides impact bone biology in a gnotobiotic mouse " + "model of infant undernutrition.", + 2019, + "Cowardin CA, Ahern PP, Kung VL, Hibberd MC, Cheng J, Guruge JL, Sundaresan V, Head RD, Barile D," + " Mills DA, Barratt MJ, Huq S, Ahmed T, Gordon JI.", + "10.1073/pnas.1821770116", + ), + ], +) +def test_lookup_publication_by_pubmed_id_should_return( + pubmed_id, expected_pub_title, expected_year_of_pub, expected_authors, expected_doi +): publications = lookup_publication_by_pubmed_id(pubmed_id) assert len(publications) == 1 @@ -49,18 +60,38 @@ def test_lookup_publication_by_pubmed_id_should_return(pubmed_id, assert publication.doi == expected_doi -@pytest.mark.parametrize("pubmed_id", [ - (0), - (000) -]) +@pytest.mark.parametrize("pubmed_id", [(0), (000)]) def test_lookup_publication_by_pubmed_id_(pubmed_id): with pytest.raises(ValueError): lookup_publication_by_pubmed_id(pubmed_id) -@pytest.mark.parametrize("pubmed_id", [ - ("test") -]) +@pytest.mark.parametrize("pubmed_id", [("test")]) def test_lookup_publication_by_pubmed_id_raises_exception_on_string(pubmed_id): with pytest.raises(TypeError): lookup_publication_by_pubmed_id(pubmed_id) + + +@pytest.mark.django_db +def test_text_fields_longer_than_expected(faker): + PUB_TITLE_MAX = 740 + PUB_TYPE_MAX = 300 + VOLUME_MAX = 55 + + # I've picked 3 fields as representatives + publications = baker.prepare( + Publication, + pub_title=faker.text(max_nb_chars=PUB_TITLE_MAX + 1000), + pub_type=faker.text(max_nb_chars=PUB_TYPE_MAX + 1000), + volume=faker.text(max_nb_chars=VOLUME_MAX + 1000), + _quantity=5, + ) + + for publication in publications: + assert len(publication.pub_title) > PUB_TITLE_MAX + assert len(publication.pub_type) > PUB_TYPE_MAX + assert len(publication.volume) > VOLUME_MAX + publication.save() + assert len(publication.pub_title) <= PUB_TITLE_MAX + assert len(publication.pub_type) <= PUB_TYPE_MAX + assert len(publication.volume) <= VOLUME_MAX From 62dfd01e270bc63ab3da088a1ce0c7e16a779b1b Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Wed, 8 Nov 2023 08:14:26 +0000 Subject: [PATCH 2/4] The corresponding migration for the Publication.pub_type change --- .../0011_alter_publication_pub_type.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 emgapi/migrations/0011_alter_publication_pub_type.py diff --git a/emgapi/migrations/0011_alter_publication_pub_type.py b/emgapi/migrations/0011_alter_publication_pub_type.py new file mode 100644 index 000000000..632263679 --- /dev/null +++ b/emgapi/migrations/0011_alter_publication_pub_type.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.18 on 2023-11-08 08:13 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('emgapi', '0010_runextraannotation'), + ] + + operations = [ + migrations.AlterField( + model_name='publication', + name='pub_type', + field=models.CharField(blank=True, db_column='PUB_TYPE', max_length=300, null=True), + ), + ] From 8967eda0f9ab807c00bc55bbe4376ea94c3015e4 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Wed, 8 Nov 2023 09:46:39 +0000 Subject: [PATCH 3/4] Forgot to add Faker test dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index bbc991dca..b5d27771b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,7 @@ tests = [ "pytest-cov==2.12.1", "pandas==1.3.2", "responses==0.23.1", + "Faker==19.6.2", ] dev = [ From c3dc316658a32a2d4158a13af9a764aa77d587b6 Mon Sep 17 00:00:00 2001 From: Martin Beracochea Date: Wed, 8 Nov 2023 09:58:21 +0000 Subject: [PATCH 4/4] Rebase dev and fixed the conflicting migration --- ...ication_pub_type.py => 0012_alter_publication_pub_type.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename emgapi/migrations/{0011_alter_publication_pub_type.py => 0012_alter_publication_pub_type.py} (75%) diff --git a/emgapi/migrations/0011_alter_publication_pub_type.py b/emgapi/migrations/0012_alter_publication_pub_type.py similarity index 75% rename from emgapi/migrations/0011_alter_publication_pub_type.py rename to emgapi/migrations/0012_alter_publication_pub_type.py index 632263679..1d61c5fb2 100644 --- a/emgapi/migrations/0011_alter_publication_pub_type.py +++ b/emgapi/migrations/0012_alter_publication_pub_type.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.18 on 2023-11-08 08:13 +# Generated by Django 3.2.18 on 2023-11-08 09:57 from django.db import migrations, models @@ -6,7 +6,7 @@ class Migration(migrations.Migration): dependencies = [ - ('emgapi', '0010_runextraannotation'), + ('emgapi', '0011_analysisjob_analysis_summary_json'), ] operations = [