Skip to content

Commit

Permalink
Truncate the fields in the Publication model if longer than expected.
Browse files Browse the repository at this point in the history
I've increased the maximum length of the pub_type field in the Publication model.
To prevent the import from failing in the future, I've implemented a brute-force
truncation mechanism that runs whenever a Publication is saved.
  • Loading branch information
mberacochea committed Nov 7, 2023
1 parent 61af088 commit 3ee7e1d
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 48 deletions.
Empty file added emgapi/mgx.py
Empty file.
13 changes: 12 additions & 1 deletion emgapi/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,14 +502,25 @@ class Publication(models.Model):
db_column='PUBLISHED_YEAR', blank=True, null=True,
help_text='Published year')
pub_type = models.CharField(
db_column='PUB_TYPE', max_length=150, blank=True, null=True)
db_column='PUB_TYPE', max_length=300, blank=True, null=True)

objects = PublicationManager()

class Meta:
db_table = 'PUBLICATION'
ordering = ('pubmed_id',)

def save(self, *args, **kwargs):
    """Truncate over-long text values to their field width, then save.

    Every ``CharField``/``TextField`` on the model that declares a
    ``max_length`` is checked; a string value longer than that limit is cut
    down to ``max_length`` characters and the truncation is logged as an
    error. Positional and keyword arguments are passed through unchanged to
    the parent ``save``.
    """
    for field in self._meta.fields:
        if not isinstance(field, (models.CharField, models.TextField)):
            continue
        max_length = field.max_length
        if max_length is None:
            # A TextField declared without max_length has no limit to enforce;
            # comparing len(...) against None would raise TypeError.
            continue
        field_value = getattr(self, field.name)
        # Only strings are measured and sliced; None, empty and non-text
        # values are passed to the parent save() untouched.
        if isinstance(field_value, str) and len(field_value) > max_length:
            logger.error(
                "Publication field %s content was truncated at %s",
                field.name,
                max_length,
            )
            setattr(self, field.name, field_value[:max_length])
    super().save(*args, **kwargs)

def __str__(self):
    """Return this publication's ``pubmed_id`` converted to a string."""
    return str(self.pubmed_id)

Expand Down
35 changes: 19 additions & 16 deletions emgapianns/management/commands/import_publication.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
import logging
from django.core.management import BaseCommand
from emgapi import models as emg_models
from emgapianns.management.lib.europe_pmc_api.europe_pmc_api_handler import EuropePMCApiHandler
from emgapianns.management.lib.europe_pmc_api.europe_pmc_api_handler import (
EuropePMCApiHandler,
)

logger = logging.getLogger(__name__)

Expand All @@ -29,15 +31,17 @@ def lookup_publication_by_pubmed_id(pubmed_id):
def update_or_create_publication(publication):
return emg_models.Publication.objects.update_or_create(
pubmed_id=publication.pmid,
defaults={'authors': publication.author_string,
'doi': publication.doi,
'isbn': publication.journal_issn,
'iso_journal': publication.journal_title,
'pub_title': publication.title,
'raw_pages': publication.page_info,
'volume': publication.journal_volume,
'published_year': publication.pub_year,
'pub_type': publication.pub_type},
defaults={
"authors": publication.author_string,
"doi": publication.doi,
"isbn": publication.journal_issn,
"iso_journal": publication.journal_title,
"pub_title": publication.title,
"raw_pages": publication.page_info,
"volume": publication.journal_volume,
"published_year": publication.pub_year,
"pub_type": publication.pub_type,
},
)


Expand All @@ -47,19 +51,18 @@ def lookup_publication_by_project_id(project_id):


class Command(BaseCommand):
help = 'Creates or updates a publication in EMG.'
help = "Creates or updates a publication in EMG."

def add_arguments(self, parser):
# TODO: Consider lookup by project id
parser.add_argument('pubmed-id',
help='PubMed identifier (PMID)',
type=int,
action='store')
parser.add_argument(
"pubmed-id", help="PubMed identifier (PMID)", type=int, action="store"
)

def handle(self, *args, **options):
logger.info("CLI %r" % options)

pubmed_id = options['pubmed-id']
pubmed_id = options["pubmed-id"]
publications = lookup_publication_by_pubmed_id(pubmed_id)
for publication in publications:
update_or_create_publication(publication)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright 2019-2022 EMBL - European Bioinformatics Institute
# Copyright 2019-2023 EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -34,7 +34,7 @@ def get_default_connection_headers():
}


class Publication(object):
class Publication:
def __init__(
self,
pub_year,
Expand Down
89 changes: 60 additions & 29 deletions tests/webuploader/test_import_publication.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,39 @@

import pytest

from emgapianns.management.commands.import_publication import lookup_publication_by_pubmed_id


@pytest.mark.parametrize("pubmed_id, expected_pub_title, expected_year_of_pub, expected_authors, expected_doi", [
(4838818,
"Proceedings: The morphological variation of nervous structures in the atrial endocardium of the dog.",
1974,
"Floyd K, Linden RJ, Saunders DA.",
"n/a"),
(31138692,
"Mechanisms by which sialylated milk oligosaccharides impact bone biology in a gnotobiotic mouse "
"model of infant undernutrition.",
2019,
"Cowardin CA, Ahern PP, Kung VL, Hibberd MC, Cheng J, Guruge JL, Sundaresan V, Head RD, Barile D,"
" Mills DA, Barratt MJ, Huq S, Ahmed T, Gordon JI.",
"10.1073/pnas.1821770116")
])
def test_lookup_publication_by_pubmed_id_should_return(pubmed_id,
expected_pub_title,
expected_year_of_pub,
expected_authors,
expected_doi):
from emgapi.models import Publication
from model_bakery import baker


from emgapianns.management.commands.import_publication import (
lookup_publication_by_pubmed_id,
)


@pytest.mark.parametrize(
"pubmed_id, expected_pub_title, expected_year_of_pub, expected_authors, expected_doi",
[
(
4838818,
"Proceedings: The morphological variation of nervous structures in the atrial endocardium of the dog.",
1974,
"Floyd K, Linden RJ, Saunders DA.",
"n/a",
),
(
31138692,
"Mechanisms by which sialylated milk oligosaccharides impact bone biology in a gnotobiotic mouse "
"model of infant undernutrition.",
2019,
"Cowardin CA, Ahern PP, Kung VL, Hibberd MC, Cheng J, Guruge JL, Sundaresan V, Head RD, Barile D,"
" Mills DA, Barratt MJ, Huq S, Ahmed T, Gordon JI.",
"10.1073/pnas.1821770116",
),
],
)
def test_lookup_publication_by_pubmed_id_should_return(
pubmed_id, expected_pub_title, expected_year_of_pub, expected_authors, expected_doi
):
publications = lookup_publication_by_pubmed_id(pubmed_id)
assert len(publications) == 1

Expand All @@ -49,18 +60,38 @@ def test_lookup_publication_by_pubmed_id_should_return(pubmed_id,
assert publication.doi == expected_doi


@pytest.mark.parametrize("pubmed_id", [
(0),
(000)
])
@pytest.mark.parametrize("pubmed_id", [(0), (000)])
def test_lookup_publication_by_pubmed_id_(pubmed_id):
with pytest.raises(ValueError):
lookup_publication_by_pubmed_id(pubmed_id)


@pytest.mark.parametrize("pubmed_id", [
("test")
])
@pytest.mark.parametrize("pubmed_id", [("test")])
def test_lookup_publication_by_pubmed_id_raises_exception_on_string(pubmed_id):
with pytest.raises(TypeError):
lookup_publication_by_pubmed_id(pubmed_id)


@pytest.mark.django_db
def test_text_fields_longer_than_expected(faker):
    """Saving a Publication cuts over-long text values to the field width.

    Three fields stand in for all text columns. Their limits are read from
    the model metadata rather than hard-coded, so the test keeps tracking
    the model when a ``max_length`` changes.
    """
    field_names = ("pub_title", "pub_type", "volume")
    limits = {
        name: Publication._meta.get_field(name).max_length for name in field_names
    }
    oversized = {
        name: faker.text(max_nb_chars=limit + 1000) for name, limit in limits.items()
    }

    publications = baker.prepare(Publication, _quantity=5, **oversized)

    for publication in publications:
        # Precondition: the generated text really exceeds each limit.
        for name, limit in limits.items():
            assert len(getattr(publication, name)) > limit

        publication.save()

        # The stored value must be exactly the leading ``limit`` characters
        # of the original text, not merely "short enough".
        for name, limit in limits.items():
            assert getattr(publication, name) == oversized[name][:limit]

0 comments on commit 3ee7e1d

Please sign in to comment.