Skip to content

Commit

Permalink
Elsevier: publication_info added artid, journal doctype, volume
Browse files Browse the repository at this point in the history
  • Loading branch information
ErnestaP committed Dec 11, 2023
1 parent 43b5912 commit bd417fe
Show file tree
Hide file tree
Showing 5 changed files with 1,989 additions and 22 deletions.
52 changes: 43 additions & 9 deletions dags/elsevier/metadata_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ class ElsevierMetadataParser(IParser):
def __init__(self, file_path) -> None:
self.file_path = file_path
self.year = None
self.journal_doctype = None
self.collaborations = []
self.logger = get_logger().bind(class_name=type(self).__name__)
self.extractors = [
CustomExtractor(
Expand All @@ -25,6 +23,11 @@ def __init__(self, file_path) -> None:
extraction_function=self._get_journal_title,
required=True,
),
CustomExtractor(
destination="journal_artid",
extraction_function=self._get_journal_aid,
required=True,
),
CustomExtractor(
destination="date_published",
extraction_function=self._get_published_date,
Expand All @@ -49,11 +52,16 @@ def __init__(self, file_path) -> None:
extraction_function=self._get_local_files,
required=True,
),
CustomExtractor(
destination="journal_volume",
extraction_function=self._get_journal_volume,
required=True,
),
]

def parse(self, article):
extracted_value = {}
journal_issues = article.findall("dataset-content/journal-item")
journal_issues = article.findall("dataset-content")
parsed_articles = []
for journal_issue in journal_issues:
extracted_value = {
Expand All @@ -65,7 +73,7 @@ def parse(self, article):
return parsed_articles

def _get_dois(self, article):
node = article.find("journal-item-unique-ids/doi")
node = article.find("journal-item/journal-item-unique-ids/doi")
if node is None:
return
dois = node.text
Expand All @@ -78,7 +86,7 @@ def _get_dois(self, article):
def _get_published_date(self, article):
date = extract_text(
article=article,
path="journal-item-properties/online-publication-date",
path="journal-item/journal-item-properties/online-publication-date",
field_name="published_date",
dois=self.dois,
)
Expand All @@ -100,13 +108,38 @@ def _get_date_published(self, article):
def _get_journal_title(self, article):
journal_title = extract_text(
article=article,
path="journal-item-unique-ids/jid-aid/jid",
field_name="collections",
path="journal-item/journal-item-unique-ids/jid-aid/jid",
field_name="journal_title",
dois=self.dois,
)
self.journal_title = journal_title
return journal_title

def _get_journal_aid(self, article):
    """Return the Elsevier article id (aid) from the journal-item unique ids.

    Delegates to ``extract_text`` with the aid node path; returns whatever
    that helper returns for a missing node (presumably ``None`` — confirm
    against ``extract_text``).
    """
    return extract_text(
        article=article,
        path="journal-item/journal-item-unique-ids/jid-aid/aid",
        field_name="journal_aid",
        dois=self.dois,
    )

def _get_journal_volume(self, article):
    """Build the journal volume string from the issue's volume metadata.

    Reads ``<vol-first>`` and ``<suppl>`` under
    ``journal-issue/journal-issue-properties/volume-issue-number`` and joins
    the values with a single space.
    """
    vol_first = extract_text(
        article=article,
        path="journal-issue/journal-issue-properties/volume-issue-number/vol-first",
        field_name="volume_vol_first",
        dois=self.dois,
    )
    suppl = extract_text(
        article=article,
        path="journal-issue/journal-issue-properties/volume-issue-number/suppl",
        field_name="volume_suppl",
        dois=self.dois,
    )

    # NOTE(review): extract_text presumably returns None/"" when the node is
    # missing (cf. _get_dois returning None) — join only the present parts so
    # a missing <suppl> does not produce the literal string "None" or a
    # trailing space in the volume. TODO confirm against extract_text.
    return " ".join(part for part in (vol_first, suppl) if part)

def _get_collections(self, article):
return [self.journal_title]

Expand All @@ -123,12 +156,13 @@ def _get_local_files(self, article):
self.file_path = self.file_path.replace("raw/", "")

pdf_file_path = os.path.join(
self.file_path, article.find("files-info/web-pdf/pathname").text
self.file_path,
article.find("journal-item/files-info/web-pdf/pathname").text,
)
return {
"pdf": pdf_file_path,
"pdfa": os.path.join(os.path.split(pdf_file_path)[0], "main_a-2b.pdf"),
"xml": os.path.join(
self.file_path, article.find("files-info/ml/pathname").text
self.file_path, article.find("journal-item/files-info/ml/pathname").text
),
}
62 changes: 57 additions & 5 deletions dags/elsevier/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,45 @@ class ElsevierParser(IParser):
def __init__(self) -> None:
self.dois = None
self.year = None
self.journal_doctype = None
self.collaborations = []
self.logger = get_logger().bind(class_name=type(self).__name__)
self.article_type_mapping = {
"article": "article",
"sco": "article",
"fla": "article",
"abs": "article",
"rev": "article",
"add": "addendum",
"edb": "editorial",
"edi": "editorial",
"err": "erratum",
"ret": "retraction",
"rem": "retraction",
"adv": "other",
"ann": "other",
"brv": "other",
"cal": "other",
"chp": "other",
"cnf": "other",
"con": "other",
"cop": "other",
"cor": "other",
"crp": "other",
"dis": "other",
"dup": "other",
"exm": "other",
"ind": "other",
"lit": "other",
"mis": "other",
"nws": "other",
"ocn": "other",
"pgl": "other",
"pnt": "other",
"prp": "other",
"prv": "other",
"pub": "other",
"req": "other",
"ssu": "other",
}
extractors = [
CustomExtractor(
destination="dois",
Expand Down Expand Up @@ -54,9 +90,9 @@ def __init__(self) -> None:
destination="copyright_statement",
source="item-info/copyright",
),
TextExtractor(
destination="journal_artid",
source="item-info/aid",
CustomExtractor(
destination="journal_doctype",
extraction_function=self._get_journal_doctype,
),
]
super().__init__(extractors)
Expand Down Expand Up @@ -170,3 +206,19 @@ def _get_affiliation(self, article, ref_id="", affiliations=[]):
"value": affiliation_value,
}
)

def _get_journal_doctype(self, article):
node = article.find(".")
value = node.get("docsubtype")
if not value:
self.logger.error("Article-type is not found in XML", dois=self.dois)
return None
try:
self.journal_doctype = self.article_type_mapping[value]
return self.journal_doctype
except KeyError:
self.logger.error(
"Unmapped article type", dois=self.dois, article_type=value
)
except Exception:
self.logger.error("Unknown error", dois=self.dois)
Loading

0 comments on commit bd417fe

Please sign in to comment.