Skip to content

Commit

Permalink
global: addition of creating or updating articles to backend (#182)
Browse files Browse the repository at this point in the history
  • Loading branch information
drjova authored Nov 29, 2023
1 parent 306a642 commit bf3f848
Show file tree
Hide file tree
Showing 16 changed files with 613 additions and 6 deletions.
6 changes: 6 additions & 0 deletions dags/aps/aps_process_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from aps.parser import APSParser
from common.enhancer import Enhancer
from common.enricher import Enricher
from common.utils import create_or_update_article
from jsonschema import validate


Expand Down Expand Up @@ -48,10 +49,15 @@ def enrich(enhanced_file):
def validate_record(enriched_file):
return enriched_file and aps_validate_record(enriched_file)

@task()
def create_or_update(enriched_file):
    """Send the enriched APS record to the backend.

    Delegates to ``create_or_update_article``; the helper's return value is
    not propagated, so this task returns ``None``.

    NOTE(review): the task only takes ``enriched_file`` as input, so nothing
    here appears to order it after ``validate_record`` -- confirm intended.
    """
    create_or_update_article(enriched_file)

parsed_file = parse()
enhanced_file = enchance(parsed_file)
enriched_file = enrich(enhanced_file)
validate_record(enriched_file)
create_or_update(enriched_file)


dag_for_aps_files_processing = aps_process_file()
22 changes: 22 additions & 0 deletions dags/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from io import StringIO
from stat import S_ISDIR, S_ISREG

import backoff
import requests
from airflow.models.dagrun import DagRun
from airflow.utils.state import DagRunState
from common.constants import (
Expand Down Expand Up @@ -233,3 +235,23 @@ def process_archive(file_bytes, file_name, **kwargs):
return process_zip_file(file_bytes, file_name, **kwargs)
if tarfile.is_tarfile(file_bytes):
return process_tar_file(file_bytes, file_name, **kwargs)


@backoff.on_exception(
    backoff.expo,
    (requests.exceptions.ConnectionError, requests.exceptions.Timeout),
    max_tries=5,
)
def create_or_update_article(data, timeout=60):
    """POST an article record to the backend import endpoint.

    The endpoint is read from ``BACKEND_URL`` and the API token from
    ``BACKEND_TOKEN``; both fall back to placeholder defaults when unset.
    Connection errors and timeouts are retried with exponential backoff, up
    to five attempts in total.

    Args:
        data: JSON-serialisable article record.
        timeout: Seconds to wait for the backend, forwarded to
            ``requests.post``. ``requests`` applies no timeout unless one is
            given, so without it a stalled backend would block the task
            forever instead of raising ``Timeout`` and being retried.

    Returns:
        The decoded JSON body of the backend response.

    Raises:
        requests.exceptions.HTTPError: For a 4xx/5xx response. This is not
            retried because it is not in the backoff exception tuple.
        requests.exceptions.ConnectionError, requests.exceptions.Timeout:
            Once the retry attempts are exhausted.
    """
    backend_url = os.getenv(
        "BACKEND_URL", "http://localhost:8000/api/article-workflow-import/"
    )
    token = os.getenv("BACKEND_TOKEN", "CHANGE_ME")
    headers = {"Content-Type": "application/json", "Authorization": f"Token {token}"}
    response = requests.post(
        backend_url,
        data=json.dumps(data),
        headers=headers,
        timeout=timeout,
    )
    # Surface backend rejections (e.g. a 400 for an invalid record) as errors
    # instead of returning the error payload as if it were an article.
    response.raise_for_status()
    return response.json()
7 changes: 6 additions & 1 deletion dags/elsevier/elsevier_file_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from common.enhancer import Enhancer
from common.enricher import Enricher
from common.exceptions import EmptyOutputFromPreviousTask
from common.utils import parse_without_names_spaces
from common.utils import create_or_update_article, parse_without_names_spaces
from elsevier.parser import ElsevierParser
from jsonschema import validate

Expand Down Expand Up @@ -64,10 +64,15 @@ def validate_record(enriched_file):
return elsevier_validate_record(enriched_file)
raise EmptyOutputFromPreviousTask("enriched_file_with_metadata")

@task()
def create_or_update(enriched_file):
    """Send the enriched Elsevier record to the backend.

    Delegates to ``create_or_update_article``; the helper's return value is
    not propagated, so this task returns ``None``.

    NOTE(review): the task only takes ``enriched_file`` as input, so nothing
    here appears to order it after ``validate_record`` -- confirm intended.
    """
    create_or_update_article(enriched_file)

parsed_file = parse()
enhanced_file = enhance(parsed_file)
enriched_file = enrich(enhanced_file)
validate_record(enriched_file)
create_or_update(enriched_file)


Elsevier_file_processing = elsevier_process_file()
6 changes: 6 additions & 0 deletions dags/hindawi/hindawi_file_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from airflow.decorators import dag, task
from common.enhancer import Enhancer
from common.enricher import Enricher
from common.utils import create_or_update_article
from hindawi.parser import HindawiParser
from jsonschema import validate

Expand Down Expand Up @@ -49,10 +50,15 @@ def enrich(enhanced_file):
def validate_record(enriched_file):
return enriched_file and hindawi_validate_record(enriched_file)

@task()
def create_or_update(enriched_file):
    """Send the enriched Hindawi record to the backend.

    Delegates to ``create_or_update_article``; the helper's return value is
    not propagated, so this task returns ``None``.

    NOTE(review): the task only takes ``enriched_file`` as input, so nothing
    here appears to order it after ``validate_record`` -- confirm intended.
    """
    create_or_update_article(enriched_file)

parsed_file = parse()
enhanced_file = enchance(parsed_file)
enriched_file = enrich(enhanced_file)
validate_record(enriched_file)
create_or_update(enriched_file)


Hindawi_file_processing = hindawi_file_processing()
6 changes: 6 additions & 0 deletions dags/iop/iop_process_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from airflow.decorators import dag, task
from common.enhancer import Enhancer
from common.enricher import Enricher
from common.utils import create_or_update_article
from iop.parser import IOPParser
from jsonschema import validate

Expand Down Expand Up @@ -54,10 +55,15 @@ def enrich_file(enhanced_file):
def validate_record(enriched_file):
iop_validate_record(enriched_file)

@task()
def create_or_update(enriched_file):
    """Send the enriched IOP record to the backend.

    Delegates to ``create_or_update_article``; the helper's return value is
    not propagated, so this task returns ``None``.

    NOTE(review): the task only takes ``enriched_file`` as input, so nothing
    here appears to order it after ``validate_record`` -- confirm intended.
    """
    create_or_update_article(enriched_file)

parsed_file = parse_file()
enhanced_file = enhance_file(parsed_file)
enriched_file = enrich_file(enhanced_file)
validate_record(enriched_file)
create_or_update(enriched_file)


dag_taskflow = iop_process_file()
7 changes: 6 additions & 1 deletion dags/oup/oup_process_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from airflow.decorators import dag, task
from common.enhancer import Enhancer
from common.enricher import Enricher
from common.utils import parse_without_names_spaces
from common.utils import create_or_update_article, parse_without_names_spaces
from jsonschema import validate
from oup.parser import OUPParser

Expand Down Expand Up @@ -54,10 +54,15 @@ def enrich_file(enhanced_file):
def validate_record(enriched_file):
oup_validate_record(enriched_file)

@task()
def create_or_update(enriched_file):
    """Send the enriched OUP record to the backend.

    Delegates to ``create_or_update_article``; the helper's return value is
    not propagated, so this task returns ``None``.

    NOTE(review): the task only takes ``enriched_file`` as input, so nothing
    here appears to order it after ``validate_record`` -- confirm intended.
    """
    create_or_update_article(enriched_file)

parsed_file = parse_file()
enhanced_file = enhance_file(parsed_file)
enriched_file = enrich_file(enhanced_file)
validate_record(enriched_file)
create_or_update(enriched_file)


dag_taskflow = oup_process_file()
6 changes: 6 additions & 0 deletions dags/springer/dag_process_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from airflow.decorators import dag, task
from common.enhancer import Enhancer
from common.enricher import Enricher
from common.utils import create_or_update_article
from jsonschema import validate
from springer.parser import SpringerParser

Expand Down Expand Up @@ -54,10 +55,15 @@ def enrich_file(enhanced_file):
def validate_record(enriched_file):
springer_validate_record(enriched_file)

@task()
def create_or_update(enriched_file):
    """Send the enriched Springer record to the backend.

    Delegates to ``create_or_update_article``; the helper's return value is
    not propagated, so this task returns ``None``.

    NOTE(review): the task only takes ``enriched_file`` as input, so nothing
    here appears to order it after ``validate_record`` -- confirm intended.
    """
    create_or_update_article(enriched_file)

parsed_file = parse_file()
enhanced_file = enhance_file(parsed_file)
enriched_file = enrich_file(enhanced_file)
validate_record(enriched_file)
create_or_update(enriched_file)


dag_taskflow = springer_process_file()
9 changes: 9 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,12 @@ def test_case_instance():
@pytest.fixture(scope="session")
def assertListEqual(test_case_instance):
    """Provide a two-argument comparison helper bound to ``test_case_instance``.

    Despite the fixture's name, the helper delegates to ``assertCountEqual``
    (presumably ``unittest.TestCase.assertCountEqual``, which ignores element
    order -- confirm against the ``test_case_instance`` fixture).
    """

    def compare(first, second):
        return test_case_instance.assertCountEqual(first, second)

    return compare

@pytest.fixture(scope="session")
def vcr_config():
    """Return the session-wide VCR options as a plain dict.

    Presumably consumed by the VCR pytest plugin behind ``@pytest.mark.vcr``
    -- confirm which plugin the project uses.
    """
    options = {
        "ignore_localhost": True,
        "decode_compressed_response": True,
        # Header names listed here are meant to stay out of recorded cassettes.
        "filter_headers": ("Authorization", "X-Amz-Date"),
        "record_mode": "once",
    }
    return options
2 changes: 1 addition & 1 deletion tests/integration/aps/test_aps_file_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ def dag():

def test_dag_loaded(dag: DAG):
assert dag is not None
assert len(dag.tasks) == 4
assert len(dag.tasks) == 5
2 changes: 1 addition & 1 deletion tests/integration/iop/test_dag_process_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def extract_zip_to_article(zip_filename):

def test_dag_loaded(dag: DAG):
assert dag is not None
assert len(dag.tasks) == 4
assert len(dag.tasks) == 5


@pytest.mark.vcr
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/oup/test_oup_dag_process_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def extract_zip_to_article(zip_filename):

def test_dag_loaded(dag: DAG):
assert dag
assert len(dag.tasks) == 4
assert len(dag.tasks) == 5


@pytest.mark.vcr
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/springer/test_dag_process_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def extract_zip_to_article(zip_filename):

def test_dag_loaded(dag: DAG):
assert dag is not None
assert len(dag.tasks) == 4
assert len(dag.tasks) == 5


@pytest.mark.skip(reason="It does not test anything.")
Expand Down
92 changes: 92 additions & 0 deletions tests/units/common/cassettes/test_create_article.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
interactions:
- request:
body:
'{"_oai": {"updated": "2023-11-14T00:15:03Z", "id": "oai:repo.scoap3.org:81153",
"sets": ["AHEP"]}, "authors": [{"surname": "Mirza", "given_names": "M. Ibrahim",
"raw_name": "Mirza, M. Ibrahim", "affiliations": [{"country": "USA", "value":
"Department of Physics and Astronomy, University of Tennessee, Knoxville, Tennessee
37916, USA"}], "full_name": "Mirza, M. Ibrahim", "orcid": "0009-0002-6581-5721"},
{"surname": "Singh", "given_names": "Jyotsna", "raw_name": "Singh, Jyotsna",
"affiliations": [{"country": "India", "value": "Department of Physics, University
of Lucknow, Lucknow, Uttar Pradesh, India"}], "full_name": "Singh, Jyotsna",
"orcid": "0000-0003-3250-3326"}], "titles": [{"source": "Hindawi", "title":
"Theoretical and Experimental Challenges in the Measurement of Neutrino Mass"}],
"dois": [{"value": "10.1155/2023/8897375"}], "publication_info": [{"page_start":
"8897375", "journal_title": "Advances in High Energy Physics", "year": 2023}],
"$schema": "http://repo.scoap3.org/schemas/hep.json", "acquisition_source":
{"date": "2023-11-14T00:15:01.612737", "source": "Hindawi", "method": "Hindawi",
"submission_number": "d9b644cc828211eeb8f48e9c197d18e3"}, "page_nr": [14], "license":
[{"url": "http://creativecommons.org/licenses/by/3.0/", "license": "CC-BY-3.0"}],
"copyright": [{"statement": "Copyright \u00a9 2023 Jyotsna Singh and M. Ibrahim
Mirza.", "year": "2023"}], "control_number": "81153", "record_creation_date":
"2023-10-27T12:15:05.910972", "collections": [{"primary": "Advances in High
Energy Physics"}], "arxiv_eprints": [{"categories": ["hep-ex", "hep-ph"], "value":
"2305.12654"}], "abstracts": [{"source": "Hindawi", "value": "Neutrino masses
are yet unknown. We discuss the present state of effective electron antineutrino
mass from <math id=\"M1\"><mi>\u03b2</mi></math> decay experiments; effective
Majorana neutrino mass from neutrinoless double-beta decay experiments; neutrino
mass squared differences from neutrino oscillation: solar, atmospheric, reactor,
and accelerator-based experiments; sum of neutrino masses from cosmological
observations. Current experimental challenges in the determination of neutrino
masses are briefly discussed. The main focus is devoted to contemporary experiments."}],
"imprints": [{"date": "2023-10-27", "publisher": "Hindawi"}]}'
headers:
Accept:
- "*/*"
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- "2304"
Content-Type:
- application/json
User-Agent:
- python-requests/2.29.0
method: POST
uri: https://backend.dev.scoap3.org/api/article-workflow-import/
response:
body:
string:
"{\"id\":81153,\"related_files\":[{\"id\":224327,\"file\":\"https://scoap3-dev-backend.s3.cern.ch/media/files/81153/10.1155/2023/8897375.pdf\",\"created\":\"2023-11-27T14:10:10.249047Z\",\"updated\":\"2023-11-27T14:10:10.249062Z\",\"article_id\":81153},{\"id\":224328,\"file\":\"https://scoap3-dev-backend.s3.cern.ch/media/files/81153/10.1155/2023/8897375.a.pdf\",\"created\":\"2023-11-27T14:10:10.254151Z\",\"updated\":\"2023-11-27T14:10:10.254164Z\",\"article_id\":81153},{\"id\":224329,\"file\":\"https://scoap3-dev-backend.s3.cern.ch/media/files/81153/10.1155/2023/8897375.xml\",\"created\":\"2023-11-27T14:10:10.258265Z\",\"updated\":\"2023-11-27T14:10:10.258278Z\",\"article_id\":81153}],\"article_identifiers\":[{\"id\":255244,\"identifier_type\":\"DOI\",\"identifier_value\":\"10.1155/2023/8897375\",\"article_id\":81153},{\"id\":255245,\"identifier_type\":\"arXiv\",\"identifier_value\":\"2305.12654\",\"article_id\":81153}],\"article_arxiv_category\":[{\"id\":181372,\"category\":\"hep-ex\",\"primary\":true,\"article_id\":81153},{\"id\":181373,\"category\":\"hep-ph\",\"primary\":false,\"article_id\":81153}],\"publication_info\":[{\"id\":156787,\"publisher\":\"Hindawi\",\"journal_volume\":\"\",\"journal_title\":\"Advances
in High Energy Physics\",\"journal_issue\":\"\",\"page_start\":\"8897375\",\"page_end\":\"\",\"artid\":\"\",\"volume_year\":\"2023\",\"journal_issue_date\":null,\"article_id\":81153}],\"copyright\":[{\"statement\":\"Copyright
\xA9 2023 Jyotsna Singh and M. Ibrahim Mirza.\",\"holder\":\"\",\"year\":2023}],\"reception_date\":null,\"acceptance_date\":null,\"publication_date\":\"2023-10-27\",\"first_online_date\":null,\"title\":\"Theoretical
and Experimental Challenges in the Measurement of Neutrino Mass\",\"subtitle\":\"\",\"abstract\":\"Neutrino
masses are yet unknown. We discuss the present state of effective electron
antineutrino mass from <math id=\\\"M1\\\"><mi>\u03B2</mi></math> decay experiments;
effective Majorana neutrino mass from neutrinoless double-beta decay experiments;
neutrino mass squared differences from neutrino oscillation: solar, atmospheric,
reactor, and accelerator-based experiments; sum of neutrino masses from cosmological
observations. Current experimental challenges in the determination of neutrino
masses are briefly discussed. The main focus is devoted to contemporary experiments.\",\"_created_at\":\"2023-10-27T12:15:05.910972Z\",\"_updated_at\":\"2023-11-28T15:34:24.552913Z\",\"related_licenses\":[7],\"related_materials\":[]}"
headers:
allow:
- POST, OPTIONS
content-language:
- en
content-length:
- "2309"
content-type:
- application/json
cross-origin-opener-policy:
- same-origin
date:
- Tue, 28 Nov 2023 15:34:24 GMT
referrer-policy:
- same-origin
server:
- gunicorn
strict-transport-security:
- max-age=60; includeSubDomains; preload
vary:
- Accept, Accept-Language, Cookie, origin
x-content-type-options:
- nosniff
x-frame-options:
- DENY
x-proxy-backend:
- scoap3-dev_scoap3-backend-web_http
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
interactions:
- request:
body: "{}"
headers:
Accept:
- "*/*"
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- "2"
Content-Type:
- application/json
User-Agent:
- python-requests/2.29.0
method: POST
uri: https://backend.dev.scoap3.org/api/article-workflow-import/
response:
body:
string: '{"message":"''license''"}'
headers:
allow:
- POST, OPTIONS
content-language:
- en
content-length:
- "23"
content-type:
- application/json
cross-origin-opener-policy:
- same-origin
date:
- Tue, 28 Nov 2023 15:34:25 GMT
referrer-policy:
- same-origin
server:
- gunicorn
strict-transport-security:
- max-age=60; includeSubDomains; preload
vary:
- Accept, Accept-Language, Cookie, origin
x-content-type-options:
- nosniff
x-frame-options:
- DENY
x-proxy-backend:
- scoap3-dev_scoap3-backend-web_http
status:
code: 400
message: Bad Request
version: 1
Loading

0 comments on commit bf3f848

Please sign in to comment.