Skip to content

Commit

Permalink
Elsevier: publication_info added artid, journal doctype, volume (#186)
Browse files Browse the repository at this point in the history
  • Loading branch information
ErnestaP authored Dec 11, 2023
1 parent 43b5912 commit b40a014
Show file tree
Hide file tree
Showing 5 changed files with 1,979 additions and 16 deletions.
38 changes: 34 additions & 4 deletions dags/elsevier/metadata_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ class ElsevierMetadataParser(IParser):
def __init__(self, file_path) -> None:
self.file_path = file_path
self.year = None
self.journal_doctype = None
self.collaborations = []
self.logger = get_logger().bind(class_name=type(self).__name__)
self.extractors = [
CustomExtractor(
Expand All @@ -25,6 +23,11 @@ def __init__(self, file_path) -> None:
extraction_function=self._get_journal_title,
required=True,
),
CustomExtractor(
destination="journal_artid",
extraction_function=self._get_journal_aid,
required=True,
),
CustomExtractor(
destination="date_published",
extraction_function=self._get_published_date,
Expand Down Expand Up @@ -61,6 +64,7 @@ def parse(self, article):
for extractor in self.extractors
if (value := extractor.extract(journal_issue)) is not None
}
extracted_value["journal_volume"] = self._get_journal_volume(article.find("dataset-content"))
parsed_articles.append(self._generic_parsing(extracted_value))
return parsed_articles

Expand Down Expand Up @@ -101,12 +105,37 @@ def _get_journal_title(self, article):
journal_title = extract_text(
article=article,
path="journal-item-unique-ids/jid-aid/jid",
field_name="collections",
field_name="journal_title",
dois=self.dois,
)
self.journal_title = journal_title
return journal_title

def _get_journal_aid(self, article):
    """Look up the journal article id for ``article``.

    Delegates to ``extract_text`` with the path
    ``journal-item-unique-ids/jid-aid/aid``, passing ``self.dois`` along,
    and returns whatever that helper yields.
    """
    return extract_text(
        article=article,
        path="journal-item-unique-ids/jid-aid/aid",
        field_name="journal_aid",
        dois=self.dois,
    )

def _get_journal_volume(self, article):
    """Build the journal volume string from the issue properties.

    Two values are looked up below ``article`` via ``extract_text``: the
    first volume number (``vol-first``) and the supplement (``suppl``).
    They are joined with a single space; a value that is missing or empty
    is left out instead of being rendered into the result.

    Returns:
        ``"<vol-first> <suppl>"`` when both values are present, the single
        available value when only one is, and ``""`` when neither is.
    """
    vol_first = extract_text(
        article=article,
        path="journal-issue/journal-issue-properties/volume-issue-number/vol-first",
        field_name="volume_vol_first",
        dois=None,
    )
    suppl = extract_text(
        article=article,
        path="journal-issue/journal-issue-properties/volume-issue-number/suppl",
        field_name="volume_suppl",
        dois=None,
    )

    # Formatting both values straight into an f-string would write the
    # literal text "None" into the volume whenever a lookup comes back empty.
    # NOTE(review): assumes extract_text yields None (or "") for an absent
    # element -- confirm against the helper.
    return " ".join(part for part in (vol_first, suppl) if part)

def _get_collections(self, article):
return [self.journal_title]

Expand All @@ -123,7 +152,8 @@ def _get_local_files(self, article):
self.file_path = self.file_path.replace("raw/", "")

pdf_file_path = os.path.join(
self.file_path, article.find("files-info/web-pdf/pathname").text
self.file_path,
article.find("files-info/web-pdf/pathname").text,
)
return {
"pdf": pdf_file_path,
Expand Down
62 changes: 57 additions & 5 deletions dags/elsevier/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,45 @@ class ElsevierParser(IParser):
def __init__(self) -> None:
self.dois = None
self.year = None
self.journal_doctype = None
self.collaborations = []
self.logger = get_logger().bind(class_name=type(self).__name__)
self.article_type_mapping = {
"article": "article",
"sco": "article",
"fla": "article",
"abs": "article",
"rev": "article",
"add": "addendum",
"edb": "editorial",
"edi": "editorial",
"err": "erratum",
"ret": "retraction",
"rem": "retraction",
"adv": "other",
"ann": "other",
"brv": "other",
"cal": "other",
"chp": "other",
"cnf": "other",
"con": "other",
"cop": "other",
"cor": "other",
"crp": "other",
"dis": "other",
"dup": "other",
"exm": "other",
"ind": "other",
"lit": "other",
"mis": "other",
"nws": "other",
"ocn": "other",
"pgl": "other",
"pnt": "other",
"prp": "other",
"prv": "other",
"pub": "other",
"req": "other",
"ssu": "other",
}
extractors = [
CustomExtractor(
destination="dois",
Expand Down Expand Up @@ -54,9 +90,9 @@ def __init__(self) -> None:
destination="copyright_statement",
source="item-info/copyright",
),
TextExtractor(
destination="journal_artid",
source="item-info/aid",
CustomExtractor(
destination="journal_doctype",
extraction_function=self._get_journal_doctype,
),
]
super().__init__(extractors)
Expand Down Expand Up @@ -170,3 +206,19 @@ def _get_affiliation(self, article, ref_id="", affiliations=[]):
"value": affiliation_value,
}
)

def _get_journal_doctype(self, article):
node = article.find(".")
value = node.get("docsubtype")
if not value:
self.logger.error("Article-type is not found in XML", dois=self.dois)
return None
try:
self.journal_doctype = self.article_type_mapping[value]
return self.journal_doctype
except KeyError:
self.logger.error(
"Unmapped article type", dois=self.dois, article_type=value
)
except Exception:
self.logger.error("Unknown error", dois=self.dois)
Loading

0 comments on commit b40a014

Please sign in to comment.