diff --git a/dags/elsevier/metadata_parser.py b/dags/elsevier/metadata_parser.py index 7d40c8ad..74bd73f9 100644 --- a/dags/elsevier/metadata_parser.py +++ b/dags/elsevier/metadata_parser.py @@ -11,8 +11,6 @@ class ElsevierMetadataParser(IParser): def __init__(self, file_path) -> None: self.file_path = file_path self.year = None - self.journal_doctype = None - self.collaborations = [] self.logger = get_logger().bind(class_name=type(self).__name__) self.extractors = [ CustomExtractor( @@ -25,6 +23,11 @@ def __init__(self, file_path) -> None: extraction_function=self._get_journal_title, required=True, ), + CustomExtractor( + destination="journal_artid", + extraction_function=self._get_journal_aid, + required=True, + ), CustomExtractor( destination="date_published", extraction_function=self._get_published_date, @@ -61,6 +64,7 @@ def parse(self, article): for extractor in self.extractors if (value := extractor.extract(journal_issue)) is not None } + extracted_value["journal_volume"] = self._get_journal_volume(article.find("dataset-content")) parsed_articles.append(self._generic_parsing(extracted_value)) return parsed_articles @@ -101,12 +105,37 @@ def _get_journal_title(self, article): journal_title = extract_text( article=article, path="journal-item-unique-ids/jid-aid/jid", - field_name="collections", + field_name="journal_title", dois=self.dois, ) self.journal_title = journal_title return journal_title + def _get_journal_aid(self, article): + journal_aid = extract_text( + article=article, + path="journal-item-unique-ids/jid-aid/aid", + field_name="journal_aid", + dois=self.dois, + ) + return journal_aid + + def _get_journal_volume(self, article): + vol_first = extract_text( + article=article, + path="journal-issue/journal-issue-properties/volume-issue-number/vol-first", + field_name="volume_vol_first", + dois=None + ) + suppl = extract_text( + article=article, + path="journal-issue/journal-issue-properties/volume-issue-number/suppl", + field_name="volume_suppl", + dois=None + ) + + return f"{vol_first} {suppl}" + def _get_collections(self, article): return [self.journal_title] @@ -123,7 +152,8 @@ def _get_local_files(self, article): self.file_path = self.file_path.replace("raw/", "") pdf_file_path = os.path.join( - self.file_path, article.find("files-info/web-pdf/pathname").text + self.file_path, + article.find("files-info/web-pdf/pathname").text, ) return { "pdf": pdf_file_path, diff --git a/dags/elsevier/parser.py b/dags/elsevier/parser.py index 53742aed..07a3e2df 100644 --- a/dags/elsevier/parser.py +++ b/dags/elsevier/parser.py @@ -14,9 +14,45 @@ class ElsevierParser(IParser): def __init__(self) -> None: self.dois = None self.year = None - self.journal_doctype = None - self.collaborations = [] self.logger = get_logger().bind(class_name=type(self).__name__) + self.article_type_mapping = { + "article": "article", + "sco": "article", + "fla": "article", + "abs": "article", + "rev": "article", + "add": "addendum", + "edb": "editorial", + "edi": "editorial", + "err": "erratum", + "ret": "retraction", + "rem": "retraction", + "adv": "other", + "ann": "other", + "brv": "other", + "cal": "other", + "chp": "other", + "cnf": "other", + "con": "other", + "cop": "other", + "cor": "other", + "crp": "other", + "dis": "other", + "dup": "other", + "exm": "other", + "ind": "other", + "lit": "other", + "mis": "other", + "nws": "other", + "ocn": "other", + "pgl": "other", + "pnt": "other", + "prp": "other", + "prv": "other", + "pub": "other", + "req": "other", + "ssu": "other", + } extractors = [ CustomExtractor( destination="dois", @@ -54,9 +90,9 @@ def __init__(self) -> None: destination="copyright_statement", source="item-info/copyright", ), - TextExtractor( - destination="journal_artid", - source="item-info/aid", + CustomExtractor( + destination="journal_doctype", + extraction_function=self._get_journal_doctype, ), ] super().__init__(extractors) @@ -170,3 +206,19 @@ def _get_affiliation(self, article, ref_id="", affiliations=[]): "value": affiliation_value, } ) + + def _get_journal_doctype(self, article): + node = article.find(".") + value = node.get("docsubtype") + if not value: + self.logger.error("Article-type is not found in XML", dois=self.dois) + return None + try: + self.journal_doctype = self.article_type_mapping[value] + return self.journal_doctype + except KeyError: + self.logger.error( + "Unmapped article type", dois=self.dois, article_type=value + ) + except Exception: + self.logger.error("Unknown error", dois=self.dois) diff --git a/tests/units/elsevier/data/dataset_bfrqq.xml b/tests/units/elsevier/data/dataset_bfrqq.xml new file mode 100644 index 00000000..f49e4d73 --- /dev/null +++ b/tests/units/elsevier/data/dataset_bfrqq.xml @@ -0,0 +1,1807 @@ + + + + + CERN + CERNAB00000010631 + 2023-09-06T11:42:34 + + + LOAD + CAP + + + + + H200.37 + H200 + + + S0370-2693(23)X0009-4 + + + PLB + 0370-2693 + + 845 + C + + Physics Letters B + + + + 03702693/v845sC/issue.xml + 1 + MAIN + SI 5.6.0 + + + + + + S250.1 + S250 + + + S0370-2693(23)00444-6 + 10.1016/j.physletb.2023.138110 + + PLB + 0370-2693 + 138110 + 138110 + + + + SCO + NON-CRC + 2023-08-14T14:22:58Z + + + + 03702693/v845sC/S0370269323004446/main.xml + 608560 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004446/main.pdf + 4248308 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00474-4 + 10.1016/j.physletb.2023.138140 + + PLB + 0370-2693 + 138140 + 138140 + + + + SCO + NON-CRC + 2023-08-24T14:38:06Z + + + + 03702693/v845sC/S0370269323004744/main.xml + 277260 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004744/main.pdf + 866360 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00475-6 + 10.1016/j.physletb.2023.138141 + + PLB + 0370-2693 + 138141 + 138141 + + + + SCO + NON-CRC + 2023-08-24T14:38:46Z + + + + 03702693/v845sC/S0370269323004756/main.xml + 186657 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004756/main.pdf + 255072 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00476-8 + 10.1016/j.physletb.2023.138142 + + PLB + 0370-2693 + 138142 + 138142 + + + + SCO + NON-CRC + 2023-08-23T14:00:03Z + + + + 03702693/v845sC/S0370269323004768/main.xml + 236598 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004768/main.pdf + 422017 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00064-3 + 10.1016/j.physletb.2023.137730 + + PLB + 0370-2693 + 137730 + 137730 + + + + SCO + NON-CRC + 2023-02-04T00:19:29Z + + + + 03702693/v845sC/S0370269323000643/main.xml + 488606 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323000643/main.pdf + 2378828 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00457-4 + 10.1016/j.physletb.2023.138123 + + PLB + 0370-2693 + 138123 + 138123 + + + + SCO + NON-CRC + 2023-08-18T07:37:56Z + + + + 03702693/v845sC/S0370269323004574/main.xml + 319635 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004574/main.pdf + 845130 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00471-9 + 10.1016/j.physletb.2023.138137 + + PLB + 0370-2693 + 138137 + 138137 + + + + SCO + NON-CRC + 2023-08-22T14:36:25Z + + + + 03702693/v845sC/S0370269323004719/main.xml + 296097 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004719/main.pdf + 1374631 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00492-6 + 10.1016/j.physletb.2023.138158 + + PLB + 0370-2693 + 138158 + 138158 + + + + SCO + NON-CRC + 2023-08-30T14:56:06Z + + + + 03702693/v845sC/S0370269323004926/main.xml + 426948 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004926/main.pdf + 462000 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00454-9 + 10.1016/j.physletb.2023.138120 + + PLB + 0370-2693 + 138120 + 138120 + + + + SCO + NON-CRC + 2023-08-11T19:05:57Z + + + + 03702693/v845sC/S0370269323004549/main.xml + 135057 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004549/main.pdf + 930907 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00482-3 + 10.1016/j.physletb.2023.138148 + + PLB + 0370-2693 + 138148 + 138148 + + + + SCO + NON-CRC + 2023-08-28T14:42:58Z + + + + 03702693/v845sC/S0370269323004823/main.xml + 156996 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004823/main.pdf + 1472575 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00481-1 + 10.1016/j.physletb.2023.138147 + + PLB + 0370-2693 + 138147 + 138147 + + + + SCO + NON-CRC + 2023-08-25T14:08:05Z + + + + 03702693/v845sC/S0370269323004811/main.xml + 168822 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004811/main.pdf + 369096 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00495-1 + 10.1016/j.physletb.2023.138161 + + PLB + 0370-2693 + 138161 + 138161 + + + + SCO + NON-CRC + 2023-08-31T13:19:27Z + + + + 03702693/v845sC/S0370269323004951/main.xml + 251459 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004951/main.pdf + 1650836 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00465-3 + 10.1016/j.physletb.2023.138131 + + PLB + 0370-2693 + 138131 + 138131 + + + + SCO + NON-CRC + 2023-08-21T14:44:11Z + + + + 03702693/v845sC/S0370269323004653/main.xml + 200027 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004653/main.pdf + 340475 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00468-9 + 10.1016/j.physletb.2023.138134 + + PLB + 0370-2693 + 138134 + 138134 + + + + SCO + NON-CRC + 2023-08-21T14:08:21Z + + + + 03702693/v845sC/S0370269323004689/main.xml + 167049 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004689/main.pdf + 556183 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00458-6 + 10.1016/j.physletb.2023.138124 + + PLB + 0370-2693 + 138124 + 138124 + + + + SCO + NON-CRC + 2023-08-17T12:19:40Z + + + + 03702693/v845sC/S0370269323004586/main.xml + 166865 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004586/main.pdf + 550460 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00479-3 + 10.1016/j.physletb.2023.138145 + + PLB + 0370-2693 + 138145 + 138145 + + + + SCO + NON-CRC + 2023-08-28T14:51:59Z + + + + 03702693/v845sC/S0370269323004793/main.xml + 627525 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004793/main.pdf + 2039464 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00469-0 + 10.1016/j.physletb.2023.138135 + + PLB + 0370-2693 + 138135 + 138135 + + + + SCO + NON-CRC + 2023-08-21T14:43:46Z + + + + 03702693/v845sC/S0370269323004690/main.xml + 232266 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004690/main.pdf + 892308 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00461-6 + 10.1016/j.physletb.2023.138127 + + PLB + 0370-2693 + 138127 + 138127 + + + + SCO + NON-CRC + 2023-08-19T05:56:26Z + + + + 03702693/v845sC/S0370269323004616/main.xml + 184735 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004616/main.pdf + 941347 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00478-1 + 10.1016/j.physletb.2023.138144 + + PLB + 0370-2693 + 138144 + 138144 + + + + SCO + NON-CRC + 2023-08-25T14:26:01Z + + + + 03702693/v845sC/S0370269323004781/main.xml + 255178 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004781/main.pdf + 489895 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00455-0 + 10.1016/j.physletb.2023.138121 + + PLB + 0370-2693 + 138121 + 138121 + + + + SCO + NON-CRC + 2023-08-14T14:16:02Z + + + + 03702693/v845sC/S0370269323004550/main.xml + 248909 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004550/main.pdf + 477106 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00466-5 + 10.1016/j.physletb.2023.138132 + + PLB + 0370-2693 + 138132 + 138132 + + + + SCO + NON-CRC + 2023-08-21T14:29:54Z + + + + 03702693/v845sC/S0370269323004665/main.xml + 205110 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004665/main.pdf + 287568 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00483-5 + 10.1016/j.physletb.2023.138149 + + PLB + 0370-2693 + 138149 + 138149 + + + + SCO + NON-CRC + 2023-08-28T15:01:15Z + + + + 03702693/v845sC/S0370269323004835/main.xml + 161160 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004835/main.pdf + 990146 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00487-2 + 10.1016/j.physletb.2023.138153 + + PLB + 0370-2693 + 138153 + 138153 + + + + SCO + NON-CRC + 2023-08-29T14:49:28Z + + + + 03702693/v845sC/S0370269323004872/main.xml + 215318 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004872/main.pdf + 493359 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00462-8 + 10.1016/j.physletb.2023.138128 + + PLB + 0370-2693 + 138128 + 138128 + + + + SCO + NON-CRC + 2023-08-19T00:50:03Z + + + + 03702693/v845sC/S0370269323004628/main.xml + 100385 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004628/main.pdf + 471325 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00463-X + 10.1016/j.physletb.2023.138129 + + PLB + 0370-2693 + 138129 + 138129 + + + + SCO + NON-CRC + 2023-08-21T14:43:51Z + + + + 03702693/v845sC/S037026932300463X/main.xml + 146305 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S037026932300463X/main.pdf + 321321 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00480-X + 10.1016/j.physletb.2023.138146 + + PLB + 0370-2693 + 138146 + 138146 + + + + SCO + NON-CRC + 2023-08-25T14:06:40Z + + + + 03702693/v845sC/S037026932300480X/main.xml + 129708 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S037026932300480X/main.pdf + 473421 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00459-8 + 10.1016/j.physletb.2023.138125 + + PLB + 0370-2693 + 138125 + 138125 + + + + SCO + NON-CRC + 2023-08-18T06:23:03Z + + + + 03702693/v845sC/S0370269323004598/main.xml + 424737 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004598/main.pdf + 728120 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00472-0 + 10.1016/j.physletb.2023.138138 + + PLB + 0370-2693 + 138138 + 138138 + + + + SCO + NON-CRC + 2023-08-23T15:02:01Z + + + + 03702693/v845sC/S0370269323004720/main.xml + 306709 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004720/main.pdf + 366458 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00448-3 + 10.1016/j.physletb.2023.138114 + + PLB + 0370-2693 + 138114 + 138114 + + + + SCO + NON-CRC + 2023-08-08T23:54:42Z + + + + 03702693/v845sC/S0370269323004483/main.xml + 173141 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004483/main.pdf + 576671 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00486-0 + 10.1016/j.physletb.2023.138152 + + PLB + 0370-2693 + 138152 + 138152 + + + + SCO + NON-CRC + 2023-08-29T14:49:13Z + + + + 03702693/v845sC/S0370269323004860/main.xml + 218469 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004860/main.pdf + 328288 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00477-X + 10.1016/j.physletb.2023.138143 + + PLB + 0370-2693 + 138143 + 138143 + + + + SCO + NON-CRC + 2023-08-25T14:58:10Z + + + + 03702693/v845sC/S037026932300477X/main.xml + 240214 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S037026932300477X/main.pdf + 463163 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00491-4 + 10.1016/j.physletb.2023.138157 + + PLB + 0370-2693 + 138157 + 138157 + + + + SCO + NON-CRC + 2023-08-30T14:50:45Z + + + + 03702693/v845sC/S0370269323004914/main.xml + 140103 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004914/main.pdf + 559615 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00467-7 + 10.1016/j.physletb.2023.138133 + + PLB + 0370-2693 + 138133 + 138133 + + + + SCO + NON-CRC + 2023-08-21T14:28:49Z + + + + 03702693/v845sC/S0370269323004677/main.xml + 195152 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004677/main.pdf + 391191 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00464-1 + 10.1016/j.physletb.2023.138130 + + PLB + 0370-2693 + 138130 + 138130 + + + + SCO + NON-CRC + 2023-08-21T14:12:22Z + + + + 03702693/v845sC/S0370269323004641/main.xml + 183712 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004641/main.pdf + 272343 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00473-2 + 10.1016/j.physletb.2023.138139 + + PLB + 0370-2693 + 138139 + 138139 + + + + SCO + NON-CRC + 2023-08-23T14:12:44Z + + + + 03702693/v845sC/S0370269323004732/main.xml + 162437 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004732/main.pdf + 542457 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00456-2 + 10.1016/j.physletb.2023.138122 + + PLB + 0370-2693 + 138122 + 138122 + + + + SCO + NON-CRC + 2023-08-14T14:43:25Z + + + + 03702693/v845sC/S0370269323004562/main.xml + 417078 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004562/main.pdf + 366296 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0370-2693(23)00460-4 + 10.1016/j.physletb.2023.138126 + + PLB + 0370-2693 + 138126 + 138126 + + + + SCO + NON-CRC + 2023-08-23T14:52:19Z + + + + 03702693/v845sC/S0370269323004604/main.xml + 220759 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 03702693/v845sC/S0370269323004604/main.pdf + 567499 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + H200.10 + H200 + + + S0550-3213(23)X0010-8 + + + NUPHB + 0550-3213 + + 995 + C + + Nuclear Physics, Section B + + + + 05503213/v995sC/issue.xml + 1 + MAIN + SI 5.6.0 + + + + + + S250.1 + S250 + + + S0550-3213(23)00273-0 + 10.1016/j.nuclphysb.2023.116344 + + NUPHB + 0550-3213 + 116344 + 116344 + + + + FLA + NON-CRC + 2023-09-04T16:53:44Z + + + + 05503213/v995sC/S0550321323002730/main.xml + 699944 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 05503213/v995sC/S0550321323002730/main.pdf + 366406 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0550-3213(23)00265-1 + 10.1016/j.nuclphysb.2023.116336 + + NUPHB + 0550-3213 + 116336 + 116336 + + + + FLA + NON-CRC + 2023-08-29T14:52:59Z + + + + 05503213/v995sC/S0550321323002651/main.xml + 728698 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 05503213/v995sC/S0550321323002651/main.pdf + 543093 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0550-3213(23)00257-2 + 10.1016/j.nuclphysb.2023.116328 + + NUPHB + 0550-3213 + 116328 + 116328 + + + + FLA + NON-CRC + 2023-08-16T14:08:15Z + + + + 05503213/v995sC/S0550321323002572/main.xml + 285913 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 05503213/v995sC/S0550321323002572/main.pdf + 536178 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0550-3213(23)00259-6 + 10.1016/j.nuclphysb.2023.116330 + + NUPHB + 0550-3213 + 116330 + 116330 + + + + FLA + NON-CRC + 2023-08-19T05:48:15Z + + + + 05503213/v995sC/S0550321323002596/main.xml + 354302 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 05503213/v995sC/S0550321323002596/main.pdf + 676256 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0550-3213(23)00263-8 + 10.1016/j.nuclphysb.2023.116334 + + NUPHB + 0550-3213 + 116334 + 116334 + + + + FLA + NON-CRC + 2023-08-23T14:04:28Z + + + + 05503213/v995sC/S0550321323002638/main.xml + 297521 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 05503213/v995sC/S0550321323002638/main.pdf + 426492 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0550-3213(23)00261-4 + 10.1016/j.nuclphysb.2023.116332 + + NUPHB + 0550-3213 + 116332 + 116332 + + + + FLA + NON-CRC + 2023-08-22T14:19:58Z + + + + 05503213/v995sC/S0550321323002614/main.xml + 527883 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 05503213/v995sC/S0550321323002614/main.pdf + 3674678 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0550-3213(23)00258-4 + 10.1016/j.nuclphysb.2023.116329 + + NUPHB + 0550-3213 + 116329 + 116329 + + + + FLA + NON-CRC + 2023-08-18T07:48:47Z + + + + 05503213/v995sC/S0550321323002584/main.xml + 334627 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 05503213/v995sC/S0550321323002584/main.pdf + 637875 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0550-3213(23)00251-1 + 10.1016/j.nuclphysb.2023.116322 + + NUPHB + 0550-3213 + 116322 + 116322 + + + + FLA + NON-CRC + 2023-08-08T23:53:12Z + + + + 05503213/v995sC/S0550321323002511/main.xml + 704473 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 05503213/v995sC/S0550321323002511/main.pdf + 451548 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0550-3213(23)00262-6 + 10.1016/j.nuclphysb.2023.116333 + + NUPHB + 0550-3213 + 116333 + 116333 + + + + FLA + NON-CRC + 2023-08-24T14:53:48Z + + + + 05503213/v995sC/S0550321323002626/main.xml + 230540 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 05503213/v995sC/S0550321323002626/main.pdf + 342191 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + + S250.1 + S250 + + + S0550-3213(23)00260-2 + 10.1016/j.nuclphysb.2023.116331 + + NUPHB + 0550-3213 + 116331 + 116331 + + + + FLA + NON-CRC + 2023-08-23T00:06:47Z + + + + 05503213/v995sC/S0550321323002602/main.xml + 593456 + MAIN + JA 5.6.0 ARTICLE + FULL-TEXT + + + 05503213/v995sC/S0550321323002602/main.pdf + 427243 + MAIN + 1.7 7.0 + DISTILLED OPTIMIZED BOOKMARKED + + + + + diff --git a/tests/units/elsevier/test_elsevier_parser.py b/tests/units/elsevier/test_elsevier_parser.py index 99de31f0..fd044e41 100644 --- a/tests/units/elsevier/test_elsevier_parser.py +++ b/tests/units/elsevier/test_elsevier_parser.py @@ -11586,9 +11586,14 @@ def parsed_articles(parser, articles): id="test_copyright_statement", ), param( - ["137730", "138173", "137649", "138109"], - "journal_artid", - id="test_journal_artid", + [ + "article", + "article", + "article", + "article", + ], + "journal_doctype", + id="test_journal_doctype", ), ], ) diff --git a/tests/units/elsevier/test_metadata_parser.py b/tests/units/elsevier/test_metadata_parser.py index 299d8815..b80588f6 100644 --- a/tests/units/elsevier/test_metadata_parser.py +++ b/tests/units/elsevier/test_metadata_parser.py @@ -38,10 +38,38 @@ def parsed_articles(parser, article): ), param( [ - [{"journal_title": "NUPHB", "year": 2023}], - [{"journal_title": "NUPHB", "year": 2023}], - [{"journal_title": "PLB", "year": 2023}], - [{"journal_title": "PLB", "year": 2023}], + [ + { + "journal_title": "NUPHB", + "year": 2023, + "artid": "116106", + "journal_volume": "None None", + } + ], + [ + { + "journal_title": "NUPHB", + "year": 2023, + "artid": "116107", + "journal_volume": "None None", + } + ], + [ + { + "journal_title": "PLB", + "year": 2023, + "artid": "137730", + "journal_volume": "None None", + } + ], + [ + { + "journal_title": "PLB", + "year": 2023, + "artid": "137751", + "journal_volume": "None None", + } + ], ], "publication_info", id="test_publication_info", @@ -123,3 +151,44 @@ def parsed_articles(parser, article): def test_elsevier_dataset_parsing(parsed_articles, expected, key): for (parsed_article, expected_article) in zip(parsed_articles, expected): assert expected_article == parsed_article[key] + + +@fixture +def articles_with_volume(shared_datadir): + with open(shared_datadir / "dataset_bfrqq.xml") as file: + return parse_without_names_spaces(file.read()) + + +@fixture +@freeze_time("2023-11-02") +def parsed_articles_with_volume(parser, articles_with_volume): + return [article for article in parser.parse(articles_with_volume)] + + +@mark.parametrize( + "expected, key", + [ + param( + [ + [ + { + "journal_title": "PLB", + "journal_volume": "845 C", + "year": 2023, + "artid": "138110", + } + ] + ], + "publication_info", + id="test_publication_info", + ), + ], +) +@freeze_time("2023-11-02") +def test_elsevier_dataset_parsing_with_volume( + parsed_articles_with_volume, expected, key +): + for (parsed_article, expected_article) in zip( + parsed_articles_with_volume, expected + ): + assert expected_article == parsed_article[key]