From d572da4ba6b15eabc5f4e51d02fde2cc0a6987ac Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Tue, 26 Sep 2023 14:50:52 +0200 Subject: [PATCH] Hindawi: documentation * ref: https://github.com/cern-sis/issues-scoap3/issues/177 --- dags/hindawi/parser.py | 2 +- .../Hindawi/hindawi_fields_mapping.md | 205 ++++++++++++++++++ documentation/IOP/iop_fields_mapping.md | 4 +- 3 files changed, 208 insertions(+), 3 deletions(-) create mode 100644 documentation/Hindawi/hindawi_fields_mapping.md diff --git a/dags/hindawi/parser.py b/dags/hindawi/parser.py index 5193b729..5013eb3f 100644 --- a/dags/hindawi/parser.py +++ b/dags/hindawi/parser.py @@ -133,7 +133,7 @@ def _get_arxiv(self, article: ET.Element): self.prefixes, ) if not arxivs: - self.logger.error("No arxiv id_get_copyright_statement found.") + self.logger.error("No arxiv _get_arxiv found.") return None return [ diff --git a/documentation/Hindawi/hindawi_fields_mapping.md b/documentation/Hindawi/hindawi_fields_mapping.md new file mode 100644 index 00000000..1824eaaf --- /dev/null +++ b/documentation/Hindawi/hindawi_fields_mapping.md @@ -0,0 +1,205 @@ +# [Final fields](#final_fields) + +| Field | Processed | Subfield | Subsubfield | +| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -------------- | ----------- | +| dois | generic_parsing : [33] | value | | +| arxiv_eprints | enricher : [67] | value | | +| | | categories | | +| page_nr | parsing : [6] | | | +| authors | parsing : [6]
generic_parsing : [22] | surname | | +| | | given_names | | +| | | full_name | | +| | | affiliations | country | +| | | | institution | +| collections | parsing [12] | | | +| license | parsing [11] | url | | +| | | license | | +| publication_info | generic_parsing : [40]] | journal_title | | +| | | journal_volume | | +| | | year | | +| abstracts | enhancer : [46] | value | | +| acquisition_source | enhancer : [49] | source | | +| | | method | | +| | | date | | +| copyright | enhancer : [50] | year | | +| | | statement | | +| imprints | enhancer : [51] | date | | +| | | publisher | | +| record_creation_date | enhancer : [50] | | | +| titles | enhancer : [51] | title | | +| | | source | | +| $schema | enricher : [66] | | | + + +# [Enricher](#enricher) +| | | | +| ------------------------------ | ------------- | ----------------------------------------------------- | +| Reference | Field | Enricher | +| [66] | schema | \_get_schema | +| [67] | arxiv_eprints | \_get_arxiv_eprints | + +### [\_get_schema](#_get_schema) + +| Reference | Subfield | Value | Default value | +| --------- | -------- | -------------------------------------------------------------------------------- | ------------- | +| | | os.getenv("REPO_URL", "http://repo.qa.scoap3.org/schemas/hep.json") | | + +### [\_get_arxiv_eprints](#_get_arxiv_eprints) + +| Reference | Subfield | Processing | +| ------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [68] | categories | 1. Need to take arxiv id value from arxiv_eprints.value
2. Make a request to arxiv API: f'https://export.arxiv.org/api/query?search_query=id:{arxiv_eprints.value}'
3. From the XML response, take the primary category by path: arxiv:primary_category and the rest of the categories by path: /w3:category.
xml_namespaces = { "arxiv": "http://arxiv.org/schemas/atom", "w3": "http://www.w3.org/2005/Atom", } | +| [69] | value | Cleans blank space | + +# [Enhancer](#enhancer) + +| Reference | Field | Enhancer | +| ------------------------------ | -------------------- | ---------------------------------------------------------------------------------- | +| [46] | abstracts | \_\_construct_abstracts | +| [47] | acquisition_source | \_\_construct_acquisition_source | +| [48] | copyright | \_\_construct_copyright | +| [49] | imprints | \_\_construct_imprints | +| [50] | record_creation_date | \_\_construct_record_creation_date | +| [51] | titles | \_\_construct_titles | +| [52] | | \_\_remove_country | + +### [\_\_construct_abstracts](#__construct_abstracts) + +| Reference | Subfield | Value | +| ------------------------------ | -------- | ------------------------------------------------------------------------------ | +| [53] | value | Take value from generic parsing abstract [23] | +| [54] | source | Constant: Hindawi | + +### [\_\_construct_acquisition_source](#__construct_acquisition_source) + +| Reference | Subfield | Value | +| ------------------------------ | -------- | ------------------------------------------------ | +| [55] | source | Constant: Hindawi | +| [56] | method | Constant: Hindawi | +| [57] | date | datetime.datetime.now().isoformat() | + +### [\_\_construct_copyright](#__construct_copyright) + +| Reference | Subfield | Value | +| ------------------------------ | --------- | ----------------------------------------------------------------------------------------- | +| [58] | year | Take value from parsing copyright_year [10] | +| [59] | statement | Take value from parsing copyright_statement [9] | + +### [\_\_construct_imprints](#__construct_imprints) + +| Reference | Subfield | Value | +| ------------------------------ | --------- | ---------------------------------------------------------------------------------------------------- | +| [60] | date | 
Take value from generic_parsing date_published [29] | +| [61] | publisher | constant: Hindawi | + +### [\_\_construct_record_creation_date](#__construct_record_creation_date) + +| Reference | Subfield | Value | +| ------------------------------ | -------- | ------------------------------------------------ | +| [62] | | datetime.datetime.now().isoformat() | + +### [\_\_construct_titles](#__construct_titles) + +| Reference | Subfield | Value | +| ------------------------------ | -------- | ---------------------------------------------------------------------------------------------------- | +| [63] | title | removed fn tags. `FN_REGEX = re.compile(r"<fn.*<\/fn>")`
`FN_REGEX.sub("", item.pop("title", "")).strip()` | +| [64] | source | constant: Hindawi | + +### [\_\_remove_country](#__remove_country) + +| | | | | +| ------------------------------ | ---------------------------------------------------------------------------------------- | ----- | -------------------------------------------- | +| Reference | Field | Value | Processing | +| [65] | from parsed affiliation country [55] | | removes country if the value has: | + + +# [Generic parsing](#generic_parsing) + +| Reference | Field | Subfield | Processing | Default value | +|-----------|------------------------|----------------------|--------------------------------------------------------------------------------------------------------------------------------------|---------------| +| [22] | authors | surname, given_names | takes authors [2] and splits raw_name: if there is a comma, it means that the surname and given_name are in the second part | | +| [23] | abstract | | takes abstract [3] and cleans white space characters | | +| [24] | collaborations | | NO SUCH A FIELD IN HINDAWI | | +| [25] | title | | takes title [4] and cleans white space characters | | +| [26] | subtitle | | NO SUCH A FIELD IN HINDAWI | | +| [27] | journal_year | | | | +| [28] | preprint_date | | NO SUCH A FIELD IN HINDAWI | | +| [29] | date_published | | takes date_published [5] and forms it f"{tmp_date.year:04d}-{tmp_date.month:02d}-{tmp_date.day:02d}" | | +| [30] | related_article_doi | | NO SUCH A FIELD IN HINDAWI | | +| [31] | free_keywords | | NO SUCH A FIELD IN HINDAWI | | +| [32] | classification_numbers | | NO SUCH A FIELD IN HINDAWI | | +| [33] | dois | | takes dois | | +| [34] | thesis_supervisor | | NO SUCH A FIELD IN HINDAWI | | +| [35] | thesis | | NO SUCH A FIELD IN HINDAWI | | +| [36] | urls | | NO SUCH A FIELD IN HINDAWI | | +| [37] | local_files | | NO SUCH A FIELD IN HINDAWI | | +| [38] | record_creation_date | | NO SUCH A FIELD IN HINDAWI | | +| [39] | control_field | | NO SUCH A 
FIELD IN HINDAWI | | +| [40] | publication_info | | | | +| [41] | | journal_title | takes journal title [16] | | +| [42] | | journal_volume | takes journal volume [17] | | +| [43] | | journal_year | takes journal year [18] | | +| [44] | | journal_issue | NO SUCH A FIELD IN HINDAWI | | +| [45] | | journal_doctype | NO SUCH A FIELD IN HINDAWI | | + + +# [Parsing](#parsing) + + +| Reference | Field | Source | Parsing | +|-----------|---------------------|-----------------------------------------------------------------------------|------------------------------------------------------------------| +| [1] | dois | ns0:metadata/ns1:record/ns0:datafield/[@tag='024']/ns0:subfield/[@code='a'] | lambda x: [x] | +| [2] | authors | | authors_parsing | +| [3] | abstract | ns0:metadata/ns1:record/ns0:datafield/[@tag='520']/ns0:subfield/[@code='a'] | lambda x: " ".join(x.split()) | +| [4] | title | ns0:metadata/ns1:record/ns0:datafield/[@tag='245']/ns0:subfield/[@code='a'] | lambda x: x | +| [5] | date_published | ns0:metadata/ns1:record/ns0:datafield/[@tag='260']/ns0:subfield/[@code='c'] | lambda x: x | +| [6] | page_nr | ns0:metadata/ns1:record/ns0:datafield/[@tag='300']/ns0:subfield/[@code='a'] | lambda x: [int(x)] | +| [7] | publication_info | | _get_publication_info | +| [8] | arxiv_eprints | | _get_arxiv | +| [9] | copyright_statement | ns0:metadata/ns1:record/ns0:datafield/[@tag='542']/ns0:subfield/[@code='f'] | | +| [10] | copyright_year | ns0:metadata/ns1:record/ns0:datafield/[@tag='542']/ns0:subfield/ | re.search(r"[0-9]{4}", value).group(0) | +| [11] | license | | _get_license | +| [12] | collections | | constant: "Advances in High Energy Physics" | + + +### [authors_parsing](#authors_parsing) + +| Reference | Field | Source | Parsing | +|-----------|--------------|-------------------------------------------------|-----------------------------------------------------| +| [13] | raw_name | ns0:subfield[@code='a'] | lambda x: [x] | +| [14] | affiliations | | 
_get_affiliations | +| [15] | orcid | ns0:subfield[@code='a']/ns0:subfield[@code='j'] | lambda x: " ".join(x.split()) | + + + +### [_get_publication_info](#_get_publication_info) + +| Reference | Field | Source | Parsing | +|-----------|----------------|-----------------------------------------------------------------------------|---------| +| [16] | journal_title | ns0:metadata/ns1:record/ns0:datafield/[@tag='773']/ns0:subfield/[@code='p'] | | +| [17] | journal_volume | ns0:metadata/ns1:record/ns0:datafield/[@tag='773']/ns0:subfield/[@code='v'] | | +| [18] | journal_year | ns0:metadata/ns1:record/ns0:datafield/[@tag='773']/ns0:subfield/[@code='y'] | | + + + +### [_get_arxiv](#_get_arxiv) + +| Reference | Field | Source | Parsing | +|-----------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------| +| [19] | value | "ns0:metadata/ns1:record/ns0:datafield/[@tag='037']/ns0:subfield/[@code='a']"
taken only if the field below == 'arxiv'
field below:
ns0:metadata/ns1:record/ns0:datafield/[@tag='037']/ns0:subfield/[@code='9'] | Removing "arxiv" from value, leaving just digits | + + + +### [_get_license](#_get_license) +| Reference | Field | Source | Parsing | +|-----------|---------|---------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [20] | url | License urls: "ns0:metadata/ns1:record/ns0:datafield/[@tag='540']/ns0:subfield/[@code='u']" | | +| [21] | license | license text = ns0:metadata/ns1:record/ns0:datafield/[@tag='540']/ns0:subfield/[@code='a'] | url_parts = license_url.text.split("/")
clean_url_parts = list(filter(bool, url_parts))
version = clean_url_parts.pop()
license_type = clean_url_parts.pop()
f"CC-{license_type}-{version}"
| + +### [_get_affiliations](#_get_affiliations) + +| Reference | Field | Source | Parsing | +|-----------|--------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------| +| [53] | value | ns0:metadata/ns1:record/ns0:datafield/[@tag='100']/ns0:subfield[@code='u']
ns0:metadata/ns1:record/ns0:datafield/[@tag='700']/ns0:subfield[@code='u'] | | +| [54] | organization | same as value: ns0:metadata/ns1:record/ns0:datafield/[@tag='100']/ns0:subfield[@code='u']
ns0:metadata/ns1:record/ns0:datafield/[@tag='700']/ns0:subfield[@code='u'] | takes the string before the last comma | +| [55] | country | same as value: ns0:metadata/ns1:record/ns0:datafield/[@tag='100']/ns0:subfield[@code='u']
ns0:metadata/ns1:record/ns0:datafield/[@tag='700']/ns0:subfield[@code='u'] | takes the last string after comma, which starts with a capital letter | diff --git a/documentation/IOP/iop_fields_mapping.md b/documentation/IOP/iop_fields_mapping.md index 9b3a6476..f713d35b 100644 --- a/documentation/IOP/iop_fields_mapping.md +++ b/documentation/IOP/iop_fields_mapping.md @@ -80,8 +80,8 @@ | Reference | Subfield | Value | | ------------------------------ | -------- | ------------------------------------------------ | -| [57] | source | Constant: Springer | -| [58] | method | Constant: Springer | +| [57] | source | Constant: IOP | +| [58] | method | Constant: IOP | | [59] | date | datetime.datetime.now().isoformat() | ### [\_\_construct_copyright](#__construct_copyright)