Skip to content

Commit

Permalink
fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
ErnestaP committed Dec 11, 2023
1 parent bd417fe commit 5638790
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 16 deletions.
2 changes: 1 addition & 1 deletion dags/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def parse_to_int(value):
logger.error("Cannot parse to integer", value=value)


def extract_text(article, path, field_name, dois):
def extract_text(article, path, field_name, dois="No DOI passed"):
try:
return article.find(path).text
except AttributeError:
Expand Down
22 changes: 8 additions & 14 deletions dags/elsevier/metadata_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,28 +52,24 @@ def __init__(self, file_path) -> None:
extraction_function=self._get_local_files,
required=True,
),
CustomExtractor(
destination="journal_volume",
extraction_function=self._get_journal_volume,
required=True,
),
]

def parse(self, article):
extracted_value = {}
journal_issues = article.findall("dataset-content")
journal_issues = article.findall("dataset-content/journal-item")
parsed_articles = []
for journal_issue in journal_issues:
extracted_value = {
extractor.destination: value
for extractor in self.extractors
if (value := extractor.extract(journal_issue)) is not None
}
extracted_value["journal_volume"] = self._get_journal_volume(article.find("dataset-content"))
parsed_articles.append(self._generic_parsing(extracted_value))
return parsed_articles

def _get_dois(self, article):
node = article.find("journal-item/journal-item-unique-ids/doi")
node = article.find("journal-item-unique-ids/doi")
if node is None:
return
dois = node.text
Expand All @@ -86,7 +82,7 @@ def _get_dois(self, article):
def _get_published_date(self, article):
date = extract_text(
article=article,
path="journal-item/journal-item-properties/online-publication-date",
path="journal-item-properties/online-publication-date",
field_name="published_date",
dois=self.dois,
)
Expand All @@ -108,7 +104,7 @@ def _get_date_published(self, article):
def _get_journal_title(self, article):
journal_title = extract_text(
article=article,
path="journal-item/journal-item-unique-ids/jid-aid/jid",
path="journal-item-unique-ids/jid-aid/jid",
field_name="journal_title",
dois=self.dois,
)
Expand All @@ -118,7 +114,7 @@ def _get_journal_title(self, article):
def _get_journal_aid(self, article):
journal_aid = extract_text(
article=article,
path="journal-item/journal-item-unique-ids/jid-aid/aid",
path="journal-item-unique-ids/jid-aid/aid",
field_name="journal_aid",
dois=self.dois,
)
Expand All @@ -129,13 +125,11 @@ def _get_journal_volume(self, article):
article=article,
path="journal-issue/journal-issue-properties/volume-issue-number/vol-first",
field_name="volume_vol_first",
dois=self.dois,
)
suppl = extract_text(
article=article,
path="journal-issue/journal-issue-properties/volume-issue-number/suppl",
field_name="volume_suppl",
dois=self.dois,
)

return f"{vol_first} {suppl}"
Expand All @@ -157,12 +151,12 @@ def _get_local_files(self, article):

pdf_file_path = os.path.join(
self.file_path,
article.find("journal-item/files-info/web-pdf/pathname").text,
article.find("files-info/web-pdf/pathname").text,
)
return {
"pdf": pdf_file_path,
"pdfa": os.path.join(os.path.split(pdf_file_path)[0], "main_a-2b.pdf"),
"xml": os.path.join(
self.file_path, article.find("journal-item/files-info/ml/pathname").text
self.file_path, article.find("files-info/ml/pathname").text
),
}
1 change: 1 addition & 0 deletions tests/units/elsevier/test_files_proccessing_tirgger.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def test_trigger_file_processing_elsevier(elsevier_empty_repo, migrated_files):
logger=get_logger().bind(class_name="elsevier_pull_ftp"),
filenames=migrated_files,
)
print(sorted(files))
assert sorted(files) == sorted(
[
"CERNQ000000010011/S0550321323000354/main.xml",
Expand Down
3 changes: 2 additions & 1 deletion tests/units/elsevier/test_metadata_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def parsed_articles(parser, article):
id="test_publication_info",
),
param(
["2023-02-04", "2023-11-02", "2023-02-04", "2023-11-02"],
["2023-11-02", "2023-11-02", "2023-02-04", "2023-11-02"],
"date_published",
id="test_published_date",
),
Expand Down Expand Up @@ -191,4 +191,5 @@ def test_elsevier_dataset_parsing_with_volume(
for (parsed_article, expected_article) in zip(
parsed_articles_with_volume, expected
):
print(parsed_article["publication_info"])
assert expected_article == parsed_article[key]

0 comments on commit 5638790

Please sign in to comment.