Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Elsevier: publication_info added artid, journal doctype, volume #186

Merged
merged 1 commit into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions dags/elsevier/metadata_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ class ElsevierMetadataParser(IParser):
def __init__(self, file_path) -> None:
self.file_path = file_path
self.year = None
self.journal_doctype = None
self.collaborations = []
self.logger = get_logger().bind(class_name=type(self).__name__)
self.extractors = [
CustomExtractor(
Expand All @@ -25,6 +23,11 @@ def __init__(self, file_path) -> None:
extraction_function=self._get_journal_title,
required=True,
),
CustomExtractor(
destination="journal_artid",
extraction_function=self._get_journal_aid,
required=True,
),
CustomExtractor(
destination="date_published",
extraction_function=self._get_published_date,
Expand Down Expand Up @@ -61,6 +64,7 @@ def parse(self, article):
for extractor in self.extractors
if (value := extractor.extract(journal_issue)) is not None
}
extracted_value["journal_volume"] = self._get_journal_volume(article.find("dataset-content"))
parsed_articles.append(self._generic_parsing(extracted_value))
return parsed_articles

Expand Down Expand Up @@ -101,12 +105,37 @@ def _get_journal_title(self, article):
journal_title = extract_text(
article=article,
path="journal-item-unique-ids/jid-aid/jid",
field_name="collections",
field_name="journal_title",
dois=self.dois,
)
self.journal_title = journal_title
return journal_title

def _get_journal_aid(self, article):
    """Return the Elsevier article id (aid) for this article.

    Reads the ``jid-aid/aid`` node of the journal-item unique ids;
    ``self.dois`` is passed through to ``extract_text`` for logging.
    """
    return extract_text(
        article=article,
        path="journal-item-unique-ids/jid-aid/aid",
        field_name="journal_aid",
        dois=self.dois,
    )

def _get_journal_volume(self, article):
    """Build the journal volume string for the dataset-content node.

    Combines the ``vol-first`` and ``suppl`` values of the
    ``volume-issue-number`` element.  Either part may be absent in the
    XML; missing parts are skipped so the result never contains the
    literal string "None" or a dangling space (the previous f-string
    produced e.g. "5 None" when ``suppl`` was missing).

    Returns the parts joined by a single space, or ``None`` when
    neither value is present.
    """
    base_path = "journal-issue/journal-issue-properties/volume-issue-number/"
    vol_first = extract_text(
        article=article,
        path=base_path + "vol-first",
        field_name="volume_vol_first",
        dois=None,
    )
    suppl = extract_text(
        article=article,
        path=base_path + "suppl",
        field_name="volume_suppl",
        dois=None,
    )
    # Keep only the values actually found in the XML.
    parts = [part for part in (vol_first, suppl) if part]
    return " ".join(parts) or None

def _get_collections(self, article):
return [self.journal_title]

Expand All @@ -123,7 +152,8 @@ def _get_local_files(self, article):
self.file_path = self.file_path.replace("raw/", "")

pdf_file_path = os.path.join(
self.file_path, article.find("files-info/web-pdf/pathname").text
self.file_path,
article.find("files-info/web-pdf/pathname").text,
)
return {
"pdf": pdf_file_path,
Expand Down
62 changes: 57 additions & 5 deletions dags/elsevier/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,45 @@ class ElsevierParser(IParser):
def __init__(self) -> None:
self.dois = None
self.year = None
self.journal_doctype = None
self.collaborations = []
self.logger = get_logger().bind(class_name=type(self).__name__)
self.article_type_mapping = {
"article": "article",
"sco": "article",
"fla": "article",
"abs": "article",
"rev": "article",
"add": "addendum",
"edb": "editorial",
"edi": "editorial",
"err": "erratum",
"ret": "retraction",
"rem": "retraction",
"adv": "other",
"ann": "other",
"brv": "other",
"cal": "other",
"chp": "other",
"cnf": "other",
"con": "other",
"cop": "other",
"cor": "other",
"crp": "other",
"dis": "other",
"dup": "other",
"exm": "other",
"ind": "other",
"lit": "other",
"mis": "other",
"nws": "other",
"ocn": "other",
"pgl": "other",
"pnt": "other",
"prp": "other",
"prv": "other",
"pub": "other",
"req": "other",
"ssu": "other",
}
extractors = [
CustomExtractor(
destination="dois",
Expand Down Expand Up @@ -54,9 +90,9 @@ def __init__(self) -> None:
destination="copyright_statement",
source="item-info/copyright",
),
TextExtractor(
destination="journal_artid",
source="item-info/aid",
CustomExtractor(
destination="journal_doctype",
extraction_function=self._get_journal_doctype,
),
]
super().__init__(extractors)
Expand Down Expand Up @@ -170,3 +206,19 @@ def _get_affiliation(self, article, ref_id="", affiliations=[]):
"value": affiliation_value,
}
)

def _get_journal_doctype(self, article):
node = article.find(".")
value = node.get("docsubtype")
if not value:
self.logger.error("Article-type is not found in XML", dois=self.dois)
return None
try:
self.journal_doctype = self.article_type_mapping[value]
return self.journal_doctype
except KeyError:
self.logger.error(
"Unmapped article type", dois=self.dois, article_type=value
)
except Exception:
self.logger.error("Unknown error", dois=self.dois)
Loading