Skip to content

Commit

Permalink
Elsevier: publication_info added artid, journal doctype, volume
Browse files Browse the repository at this point in the history
  • Loading branch information
ErnestaP committed Dec 11, 2023
1 parent 43b5912 commit bd417fe
Show file tree
Hide file tree
Showing 5 changed files with 1,989 additions and 22 deletions.
52 changes: 43 additions & 9 deletions dags/elsevier/metadata_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ class ElsevierMetadataParser(IParser):
def __init__(self, file_path) -> None:
self.file_path = file_path
self.year = None
self.journal_doctype = None
self.collaborations = []
self.logger = get_logger().bind(class_name=type(self).__name__)
self.extractors = [
CustomExtractor(
Expand All @@ -25,6 +23,11 @@ def __init__(self, file_path) -> None:
extraction_function=self._get_journal_title,
required=True,
),
CustomExtractor(
destination="journal_artid",
extraction_function=self._get_journal_aid,
required=True,
),
CustomExtractor(
destination="date_published",
extraction_function=self._get_published_date,
Expand All @@ -49,11 +52,16 @@ def __init__(self, file_path) -> None:
extraction_function=self._get_local_files,
required=True,
),
CustomExtractor(
destination="journal_volume",
extraction_function=self._get_journal_volume,
required=True,
),
]

def parse(self, article):
extracted_value = {}
journal_issues = article.findall("dataset-content/journal-item")
journal_issues = article.findall("dataset-content")
parsed_articles = []
for journal_issue in journal_issues:
extracted_value = {
Expand All @@ -65,7 +73,7 @@ def parse(self, article):
return parsed_articles

def _get_dois(self, article):
node = article.find("journal-item-unique-ids/doi")
node = article.find("journal-item/journal-item-unique-ids/doi")
if node is None:
return
dois = node.text
Expand All @@ -78,7 +86,7 @@ def _get_dois(self, article):
def _get_published_date(self, article):
date = extract_text(
article=article,
path="journal-item-properties/online-publication-date",
path="journal-item/journal-item-properties/online-publication-date",
field_name="published_date",
dois=self.dois,
)
Expand All @@ -100,13 +108,38 @@ def _get_date_published(self, article):
def _get_journal_title(self, article):
journal_title = extract_text(
article=article,
path="journal-item-unique-ids/jid-aid/jid",
field_name="collections",
path="journal-item/journal-item-unique-ids/jid-aid/jid",
field_name="journal_title",
dois=self.dois,
)
self.journal_title = journal_title
return journal_title

def _get_journal_aid(self, article):
    """Return the Elsevier article id (aid) from the journal-item unique ids.

    Delegates to ``extract_text`` with the aid node path; returns whatever
    that helper returns for a missing node (presumably ``None`` — confirm
    against ``extract_text``).
    """
    return extract_text(
        article=article,
        path="journal-item/journal-item-unique-ids/jid-aid/aid",
        field_name="journal_aid",
        dois=self.dois,
    )

def _get_journal_volume(self, article):
    """Build the journal volume string from the issue's volume metadata.

    Reads ``<vol-first>`` and ``<suppl>`` under
    ``journal-issue/journal-issue-properties/volume-issue-number`` and joins
    the values with a single space.
    """
    vol_first = extract_text(
        article=article,
        path="journal-issue/journal-issue-properties/volume-issue-number/vol-first",
        field_name="volume_vol_first",
        dois=self.dois,
    )
    suppl = extract_text(
        article=article,
        path="journal-issue/journal-issue-properties/volume-issue-number/suppl",
        field_name="volume_suppl",
        dois=self.dois,
    )

    # NOTE(review): extract_text presumably returns None/"" when the node is
    # missing (cf. _get_dois returning None) — join only the present parts so
    # a missing <suppl> does not produce the literal string "None" or a
    # trailing space in the volume. TODO confirm against extract_text.
    return " ".join(part for part in (vol_first, suppl) if part)

def _get_collections(self, article):
return [self.journal_title]

Expand All @@ -123,12 +156,13 @@ def _get_local_files(self, article):
self.file_path = self.file_path.replace("raw/", "")

pdf_file_path = os.path.join(
self.file_path, article.find("files-info/web-pdf/pathname").text
self.file_path,
article.find("journal-item/files-info/web-pdf/pathname").text,
)
return {
"pdf": pdf_file_path,
"pdfa": os.path.join(os.path.split(pdf_file_path)[0], "main_a-2b.pdf"),
"xml": os.path.join(
self.file_path, article.find("files-info/ml/pathname").text
self.file_path, article.find("journal-item/files-info/ml/pathname").text
),
}
62 changes: 57 additions & 5 deletions dags/elsevier/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,45 @@ class ElsevierParser(IParser):
def __init__(self) -> None:
self.dois = None
self.year = None
self.journal_doctype = None
self.collaborations = []
self.logger = get_logger().bind(class_name=type(self).__name__)
self.article_type_mapping = {
"article": "article",
"sco": "article",
"fla": "article",
"abs": "article",
"rev": "article",
"add": "addendum",
"edb": "editorial",
"edi": "editorial",
"err": "erratum",
"ret": "retraction",
"rem": "retraction",
"adv": "other",
"ann": "other",
"brv": "other",
"cal": "other",
"chp": "other",
"cnf": "other",
"con": "other",
"cop": "other",
"cor": "other",
"crp": "other",
"dis": "other",
"dup": "other",
"exm": "other",
"ind": "other",
"lit": "other",
"mis": "other",
"nws": "other",
"ocn": "other",
"pgl": "other",
"pnt": "other",
"prp": "other",
"prv": "other",
"pub": "other",
"req": "other",
"ssu": "other",
}
extractors = [
CustomExtractor(
destination="dois",
Expand Down Expand Up @@ -54,9 +90,9 @@ def __init__(self) -> None:
destination="copyright_statement",
source="item-info/copyright",
),
TextExtractor(
destination="journal_artid",
source="item-info/aid",
CustomExtractor(
destination="journal_doctype",
extraction_function=self._get_journal_doctype,
),
]
super().__init__(extractors)
Expand Down Expand Up @@ -170,3 +206,19 @@ def _get_affiliation(self, article, ref_id="", affiliations=[]):
"value": affiliation_value,
}
)

def _get_journal_doctype(self, article):
node = article.find(".")
value = node.get("docsubtype")
if not value:
self.logger.error("Article-type is not found in XML", dois=self.dois)
return None
try:
self.journal_doctype = self.article_type_mapping[value]
return self.journal_doctype
except KeyError:
self.logger.error(
"Unmapped article type", dois=self.dois, article_type=value
)
except Exception:
self.logger.error("Unknown error", dois=self.dois)
Loading

0 comments on commit bd417fe

Please sign in to comment.