From 56387909fddb8d9f4e50d84f1c6ed05520c8f10b Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Mon, 11 Dec 2023 15:39:19 +0100 Subject: [PATCH] fix --- dags/common/utils.py | 2 +- dags/elsevier/metadata_parser.py | 22 +++++++------------ .../test_files_proccessing_tirgger.py | 1 + tests/units/elsevier/test_metadata_parser.py | 3 ++- 4 files changed, 12 insertions(+), 16 deletions(-) diff --git a/dags/common/utils.py b/dags/common/utils.py index de04535c..022dbaa5 100644 --- a/dags/common/utils.py +++ b/dags/common/utils.py @@ -72,7 +72,7 @@ def parse_to_int(value): logger.error("Cannot parse to integer", value=value) -def extract_text(article, path, field_name, dois): +def extract_text(article, path, field_name, dois="No DOI passed"): try: return article.find(path).text except AttributeError: diff --git a/dags/elsevier/metadata_parser.py b/dags/elsevier/metadata_parser.py index 52b82462..1d191342 100644 --- a/dags/elsevier/metadata_parser.py +++ b/dags/elsevier/metadata_parser.py @@ -52,16 +52,11 @@ def __init__(self, file_path) -> None: extraction_function=self._get_local_files, required=True, ), - CustomExtractor( - destination="journal_volume", - extraction_function=self._get_journal_volume, - required=True, - ), ] def parse(self, article): extracted_value = {} - journal_issues = article.findall("dataset-content") + journal_issues = article.findall("dataset-content/journal-item") parsed_articles = [] for journal_issue in journal_issues: extracted_value = { @@ -69,11 +64,12 @@ def parse(self, article): for extractor in self.extractors if (value := extractor.extract(journal_issue)) is not None } + extracted_value["journal_volume"] = self._get_journal_volume(article.find("dataset-content")) parsed_articles.append(self._generic_parsing(extracted_value)) return parsed_articles def _get_dois(self, article): - node = article.find("journal-item/journal-item-unique-ids/doi") + node = article.find("journal-item-unique-ids/doi") if node is None: return dois = node.text @@ -86,7 +82,7 @@ def _get_dois(self, article): def _get_published_date(self, article): date = extract_text( article=article, - path="journal-item/journal-item-properties/online-publication-date", + path="journal-item-properties/online-publication-date", field_name="published_date", dois=self.dois, ) @@ -108,7 +104,7 @@ def _get_date_published(self, article): def _get_journal_title(self, article): journal_title = extract_text( article=article, - path="journal-item/journal-item-unique-ids/jid-aid/jid", + path="journal-item-unique-ids/jid-aid/jid", field_name="journal_title", dois=self.dois, ) @@ -118,7 +114,7 @@ def _get_journal_title(self, article): def _get_journal_aid(self, article): journal_aid = extract_text( article=article, - path="journal-item/journal-item-unique-ids/jid-aid/aid", + path="journal-item-unique-ids/jid-aid/aid", field_name="journal_aid", dois=self.dois, ) @@ -129,13 +125,11 @@ def _get_journal_volume(self, article): article=article, path="journal-issue/journal-issue-properties/volume-issue-number/vol-first", field_name="volume_vol_first", - dois=self.dois, ) suppl = extract_text( article=article, path="journal-issue/journal-issue-properties/volume-issue-number/suppl", field_name="volume_suppl", - dois=self.dois, ) return f"{vol_first} {suppl}" @@ -157,12 +151,12 @@ def _get_local_files(self, article): pdf_file_path = os.path.join( self.file_path, - article.find("journal-item/files-info/web-pdf/pathname").text, + article.find("files-info/web-pdf/pathname").text, ) return { "pdf": pdf_file_path, "pdfa": os.path.join(os.path.split(pdf_file_path)[0], "main_a-2b.pdf"), "xml": os.path.join( - self.file_path, article.find("journal-item/files-info/ml/pathname").text + self.file_path, article.find("files-info/ml/pathname").text ), } diff --git a/tests/units/elsevier/test_files_proccessing_tirgger.py b/tests/units/elsevier/test_files_proccessing_tirgger.py index c7e2308d..409f14ea 100644 --- a/tests/units/elsevier/test_files_proccessing_tirgger.py +++ b/tests/units/elsevier/test_files_proccessing_tirgger.py @@ -42,6 +42,7 @@ def test_trigger_file_processing_elsevier(elsevier_empty_repo, migrated_files): logger=get_logger().bind(class_name="elsevier_pull_ftp"), filenames=migrated_files, ) + print(sorted(files)) assert sorted(files) == sorted( [ "CERNQ000000010011/S0550321323000354/main.xml", diff --git a/tests/units/elsevier/test_metadata_parser.py b/tests/units/elsevier/test_metadata_parser.py index 1ea24f4c..6a4043c1 100644 --- a/tests/units/elsevier/test_metadata_parser.py +++ b/tests/units/elsevier/test_metadata_parser.py @@ -75,7 +75,7 @@ def parsed_articles(parser, article): id="test_publication_info", ), param( - ["2023-02-04", "2023-11-02", "2023-02-04", "2023-11-02"], + ["2023-11-02", "2023-11-02", "2023-02-04", "2023-11-02"], "date_published", id="test_published_date", ), @@ -191,4 +191,5 @@ def test_elsevier_dataset_parsing_with_volume( for (parsed_article, expected_article) in zip( parsed_articles_with_volume, expected ): + print(parsed_article["publication_info"]) assert expected_article == parsed_article[key]