From f94b823db1139bcbb005b221364473d2d38a5216 Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Tue, 26 Sep 2023 14:13:41 +0200 Subject: [PATCH] hinawi final fields --- .../Hindawi/hindawi_fields_mapping.md | 122 ++++++++++++++++++ documentation/IOP/iop_fields_mapping.md | 4 +- 2 files changed, 124 insertions(+), 2 deletions(-) diff --git a/documentation/Hindawi/hindawi_fields_mapping.md b/documentation/Hindawi/hindawi_fields_mapping.md index aecfd240..d36a5ce7 100644 --- a/documentation/Hindawi/hindawi_fields_mapping.md +++ b/documentation/Hindawi/hindawi_fields_mapping.md @@ -1,3 +1,117 @@ +# [Final fields](#final_fields) + +| Field | Processed | Subfield | Subsubfield | +| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -------------- | ----------- | +| dois | generic_parsing : [33] | value | | +| arxiv_eprints | enricher : [67] | value | | +| | | categories | | +| page_nr | parsing : [6] | | | +| authors | parsing : [6]
generic_parsing : [22] | surname | | +| | | given_names | | +| | | full_name | | +| | | affiliations | country | +| | | | institution | +| collections | parsing [12] | | | +| license | parsing [11] | url | | +| | | license | | +| publication_info | generic_parsing : [40]] | journal_title | | +| | | journal_volume | | +| | | year | | +| abstracts | enhancer : [46] | value | | +| acquisition_source | enhancer : [49] | source | | +| | | method | | +| | | date | | +| copyright | enhancer : [50] | year | | +| | | statement | | +| imprints | enhancer : [51] | date | | +| | | publisher | | +| record_creation_date | enhancer : [50] | | | +| titles | enhancer : [51] | title | | +| | | source | | +| $schema | enricher : [66] | | | + + +# [Enricher](#enricher) +| | | | +| ------------------------------ | ------------- | ----------------------------------------------------- | +| Reference | Field | Enricher | +| [66] | schema | \_get_schema | +| [67] | arxiv_eprints | \_get_arxiv_eprints | + +### [\_get_schema](#_get_schema) + +| Reference | Subfield | Value | Default value | +| --------- | -------- | -------------------------------------------------------------------------------- | ------------- | +| | | os.getenv("REPO_URL", "http://repo.qa.scoap3.org/schemas/hep.json") | | + +### [\_get_arxiv_eprints](#_get_arxiv_eprints) + +| Reference | Subfield | Processing | +| ------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [68] | categories | 1. Need to take arxiv id value from arxiv_eprints.value
2. Make a request to arxiv API: f'https://export.arxiv.org/api/query?search_query=id:{arxiv_eprints.value}'
3. From the XML response, take the primary category by path: arxiv:primary_category and the rest of the categories by path: /w3:category.
xml_namespaces = { "arxiv": "http://arxiv.org/schemas/atom", "w3": "http://www.w3.org/2005/Atom", } | +| [69] | value | Cleans blank space | + +# [Enhancer](#enhancer) + +| Reference | Field | Enhancer | +| ------------------------------ | -------------------- | ---------------------------------------------------------------------------------- | +| [46] | abstracts | \_\_construct_abstracts | +| [47] | acquisition_source | \_\_construct_acquisition_source | +| [48] | copyright | \_\_construct_copyright | +| [49] | imprints | \_\_construct_imprints | +| [50] | record_creation_date | \_\_construct_record_creation_date | +| [51] | titles | \_\_construct_titles | +| [52] | | \_\_remove_country | + +### [\_\_construct_abstracts](#__construct_abstracts) + +| Reference | Subfield | Value | +| ------------------------------ | -------- | ------------------------------------------------------------------------------ | +| [53] | value | Take value from generic parsing abstract [23] | +| [54] | source | Constant: Hindawi | + +### [\_\_construct_acquisition_source](#__construct_acquisition_source) + +| Reference | Subfield | Value | +| ------------------------------ | -------- | ------------------------------------------------ | +| [55] | source | Constant: Hindawi | +| [56] | method | Constant: Hindawi | +| [57] | date | datetime.datetime.now().isoformat() | + +### [\_\_construct_copyright](#__construct_copyright) + +| Reference | Subfield | Value | +| ------------------------------ | --------- | ----------------------------------------------------------------------------------------- | +| [58] | year | Take value from parsing copyright_year [10] | +| [59] | statement | Take value from parsing copyright_statement [9] | + +### [\_\_construct_imprints](#__construct_imprints) + +| Reference | Subfield | Value | +| ------------------------------ | --------- | ---------------------------------------------------------------------------------------------------- | +| [60] | date | 
Take value from generic_parsing date_published [29] | +| [61] | publisher | constant: Hindawi | + +### [\_\_construct_record_creation_date](#__construct_record_creation_date) + +| Reference | Subfield | Value | +| ------------------------------ | -------- | ------------------------------------------------ | +| [62] | | datetime.datetime.now().isoformat() | + +### [\_\_construct_titles](#__construct_titles) + +| Reference | Subfield | Value | +| ------------------------------ | -------- | ---------------------------------------------------------------------------------------------------- | +| [63] | title | Removes fn tags. `FN_REGEX = re.compile(r"<fn.*<\/fn>")`
`FN_REGEX.sub("", item.pop("title", "")).strip()` | +| [64] | source | constant: Hindawi | + +### [\_\_remove_country](#__remove_country) + +| | | | | +| ------------------------------ | ---------------------------------------------------------------------------------------- | ----- | -------------------------------------------- | +| Reference | Field | Value | Processing | +| [65] | from parsed affiliation country [55] | | removes country if the value has: | + # [Generic parsing](#generic_parsing) | Reference | Field | Subfield | Processing | Default value | @@ -80,3 +194,11 @@ |-----------|---------|---------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | [20] | url | License urls: "ns0:metadata/ns1:record/ns0:datafield/[@tag='540']/ns0:subfield/[@code='u']" | | | [21] | license | license text = ns0:metadata/ns1:record/ns0:datafield/[@tag='540']/ns0:subfield/[@code='a'] | url_parts = license_url.text.split("/")
clean_url_parts = list(filter(bool, url_parts))
version = clean_url_parts.pop()
license_type = clean_url_parts.pop()
f"CC-{license_type}-{version}"
| + +### [_get_affiliations](#_get_affiliations) + +| Reference | Field | Source | Parsing | +|-----------|--------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------| +| [53] | value | ns0:metadata/ns1:record/ns0:datafield/[@tag='100']/ns0:subfield[@code='u']
ns0:metadata/ns1:record/ns0:datafield/[@tag='700']/ns0:subfield[@code='u'] | | +| [54] | organization | same as value: ns0:metadata/ns1:record/ns0:datafield/[@tag='100']/ns0:subfield[@code='u']
ns0:metadata/ns1:record/ns0:datafield/[@tag='700']/ns0:subfield[@code='u'] | takes the string before the last comma | +| [55] | country | same as value: ns0:metadata/ns1:record/ns0:datafield/[@tag='100']/ns0:subfield[@code='u']
ns0:metadata/ns1:record/ns0:datafield/[@tag='700']/ns0:subfield[@code='u'] | takes the last string after comma, which starts with a capital letter | diff --git a/documentation/IOP/iop_fields_mapping.md b/documentation/IOP/iop_fields_mapping.md index 9b3a6476..f713d35b 100644 --- a/documentation/IOP/iop_fields_mapping.md +++ b/documentation/IOP/iop_fields_mapping.md @@ -80,8 +80,8 @@ | Reference | Subfield | Value | | ------------------------------ | -------- | ------------------------------------------------ | -| [57] | source | Constant: Springer | -| [58] | method | Constant: Springer | +| [57] | source | Constant: IOP | +| [58] | method | Constant: IOP | | [59] | date | datetime.datetime.now().isoformat() | ### [\_\_construct_copyright](#__construct_copyright)