Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Elsevier: publication_info added artid, journal doctype, volume #186

Merged
merged 1 commit into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions dags/elsevier/metadata_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ class ElsevierMetadataParser(IParser):
def __init__(self, file_path) -> None:
self.file_path = file_path
self.year = None
self.journal_doctype = None
self.collaborations = []
self.logger = get_logger().bind(class_name=type(self).__name__)
self.extractors = [
CustomExtractor(
Expand All @@ -25,6 +23,11 @@ def __init__(self, file_path) -> None:
extraction_function=self._get_journal_title,
required=True,
),
CustomExtractor(
destination="journal_artid",
extraction_function=self._get_journal_aid,
required=True,
),
CustomExtractor(
destination="date_published",
extraction_function=self._get_published_date,
Expand Down Expand Up @@ -61,6 +64,7 @@ def parse(self, article):
for extractor in self.extractors
if (value := extractor.extract(journal_issue)) is not None
}
extracted_value["journal_volume"] = self._get_journal_volume(article.find("dataset-content"))
parsed_articles.append(self._generic_parsing(extracted_value))
return parsed_articles

Expand Down Expand Up @@ -101,12 +105,37 @@ def _get_journal_title(self, article):
journal_title = extract_text(
article=article,
path="journal-item-unique-ids/jid-aid/jid",
field_name="collections",
field_name="journal_title",
dois=self.dois,
)
self.journal_title = journal_title
return journal_title

def _get_journal_aid(self, article):
    """Return the Elsevier article id (aid) for this article.

    Reads the ``jid-aid/aid`` node of the journal-item unique ids;
    ``self.dois`` is passed through to ``extract_text`` for logging.
    """
    return extract_text(
        article=article,
        path="journal-item-unique-ids/jid-aid/aid",
        field_name="journal_aid",
        dois=self.dois,
    )

def _get_journal_volume(self, article):
    """Build the journal volume string for the dataset-content node.

    Combines the ``vol-first`` and ``suppl`` values of the
    ``volume-issue-number`` element.  Either part may be absent in the
    XML; missing parts are skipped so the result never contains the
    literal string "None" or a dangling space (the previous f-string
    produced e.g. "5 None" when ``suppl`` was missing).

    Returns the parts joined by a single space, or ``None`` when
    neither value is present.
    """
    base_path = "journal-issue/journal-issue-properties/volume-issue-number/"
    vol_first = extract_text(
        article=article,
        path=base_path + "vol-first",
        field_name="volume_vol_first",
        dois=None,
    )
    suppl = extract_text(
        article=article,
        path=base_path + "suppl",
        field_name="volume_suppl",
        dois=None,
    )
    # Keep only the values actually found in the XML.
    parts = [part for part in (vol_first, suppl) if part]
    return " ".join(parts) or None

def _get_collections(self, article):
return [self.journal_title]

Expand All @@ -123,7 +152,8 @@ def _get_local_files(self, article):
self.file_path = self.file_path.replace("raw/", "")

pdf_file_path = os.path.join(
self.file_path, article.find("files-info/web-pdf/pathname").text
self.file_path,
article.find("files-info/web-pdf/pathname").text,
)
return {
"pdf": pdf_file_path,
Expand Down
62 changes: 57 additions & 5 deletions dags/elsevier/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,45 @@ class ElsevierParser(IParser):
def __init__(self) -> None:
self.dois = None
self.year = None
self.journal_doctype = None
self.collaborations = []
self.logger = get_logger().bind(class_name=type(self).__name__)
self.article_type_mapping = {
"article": "article",
"sco": "article",
"fla": "article",
"abs": "article",
"rev": "article",
"add": "addendum",
"edb": "editorial",
"edi": "editorial",
"err": "erratum",
"ret": "retraction",
"rem": "retraction",
"adv": "other",
"ann": "other",
"brv": "other",
"cal": "other",
"chp": "other",
"cnf": "other",
"con": "other",
"cop": "other",
"cor": "other",
"crp": "other",
"dis": "other",
"dup": "other",
"exm": "other",
"ind": "other",
"lit": "other",
"mis": "other",
"nws": "other",
"ocn": "other",
"pgl": "other",
"pnt": "other",
"prp": "other",
"prv": "other",
"pub": "other",
"req": "other",
"ssu": "other",
}
extractors = [
CustomExtractor(
destination="dois",
Expand Down Expand Up @@ -54,9 +90,9 @@ def __init__(self) -> None:
destination="copyright_statement",
source="item-info/copyright",
),
TextExtractor(
destination="journal_artid",
source="item-info/aid",
CustomExtractor(
destination="journal_doctype",
extraction_function=self._get_journal_doctype,
),
]
super().__init__(extractors)
Expand Down Expand Up @@ -170,3 +206,19 @@ def _get_affiliation(self, article, ref_id="", affiliations=[]):
"value": affiliation_value,
}
)

def _get_journal_doctype(self, article):
node = article.find(".")
value = node.get("docsubtype")
if not value:
self.logger.error("Article-type is not found in XML", dois=self.dois)
return None
try:
self.journal_doctype = self.article_type_mapping[value]
return self.journal_doctype
except KeyError:
self.logger.error(
"Unmapped article type", dois=self.dois, article_type=value
)
except Exception:
self.logger.error("Unknown error", dois=self.dois)
Loading