forked from inspirehep/inspirehep
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Reworked the HTTP hooks to make them easier to use * ref: cern-sis/issues-inspire/issues/594
- Loading branch information
Showing
16 changed files
with
512 additions
and
99 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import datetime | ||
import logging | ||
from datetime import timedelta | ||
|
||
from airflow.decorators import dag, task | ||
from airflow.models import Variable | ||
from hooks.generic_http_hook import GenericHttpHook | ||
from hooks.inspirehep.inspire_http_record_management_hook import ( | ||
InspireHTTPRecordManagementHook, | ||
) | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
@dag(
    start_date=datetime.datetime(2024, 11, 28),
    schedule="@daily",
    catchup=False,
    tags=["data"],
)
def data_harvest_dag():
    """
    Initialize a DAG for data harvest workflow.

    The tasks are wired together at the bottom of this function:

    1. ``collect_ids`` queries ``/search/ids`` on the ``hepdata_connection``
       connection, passing yesterday's date as ``last_updated``.
    2. ``download_record`` fetches ``/record/ins<id>?format=json`` once per
       collected id (dynamically mapped).
    3. ``transform_record`` builds the payload for each downloaded record
       inside an isolated virtualenv.
    4. ``load_record`` posts each payload through
       ``InspireHTTPRecordManagementHook.post_record`` with
       ``pid_type="data"``.
    """
    generic_http_hook = GenericHttpHook(http_conn_id="hepdata_connection")
    inspire_http_record_management_hook = InspireHTTPRecordManagementHook()

    # Jinja template, rendered when ``transform_record`` runs. Calling
    # ``Variable.get`` here instead would run on every parse of this file
    # (one metadata-DB query per parse, and a DAG import error whenever the
    # variable is missing), which Airflow's best practices advise against.
    data_schema = "{{ var.value.data_schema }}"

    @task
    def collect_ids():
        """Return the JSON body of the id search for records updated since yesterday."""
        # NOTE(review): ``now()`` is naive, so "yesterday" follows the
        # worker's local clock - confirm that matches what the API expects.
        from_date = (datetime.datetime.now().date() - timedelta(days=1)).strftime(
            "%Y-%m-%d"
        )
        # http sensor (original author's note; presumably a reminder to gate
        # this call behind an HTTP sensor - confirm)
        payload = {"inspire_ids": True, "last_updated": from_date, "sort_by": "latest"}
        hepdata_response = generic_http_hook.call_api(
            endpoint="/search/ids", method="GET", params=payload
        )

        # The result is expanded over by ``download_record`` below, so it is
        # presumably a list of ids - verify against the API response.
        return hepdata_response.json()

    @task(map_index_template="{{id}}")
    def download_record(id):
        """Download one record as JSON; ``id`` is inserted after the ``ins`` prefix."""
        hepdata_response = generic_http_hook.call_api(
            endpoint=f"/record/ins{id}?format=json", method="GET"
        )
        return hepdata_response.json()

    @task.virtualenv(requirements=["inspire-schemas"], system_site_packages=False)
    def transform_record(data_schema, record):
        """Build the payload that ``load_record`` posts for one downloaded record."""
        # Imported here because the task runs in its own virtualenv, where
        # only the packages listed in ``requirements`` are installed.
        from inspire_schemas.builders import LiteratureBuilder

        builder = LiteratureBuilder()

        # NOTE(review): ``record`` is not read yet, so every payload is the
        # builder's empty record plus the two keys set below - confirm this
        # is a placeholder for the real mapping.
        data = builder.record
        data["$schema"] = data_schema
        data.update({"_collections": ["Data"]})  # to delete

        return data

    @task
    def load_record(record):
        """Post one built payload with ``pid_type="data"``."""
        inspire_http_record_management_hook.post_record(data=record, pid_type="data")

    ids = collect_ids()
    records = download_record.expand(id=ids)
    built_records = transform_record.partial(data_schema=data_schema).expand(
        record=records
    )
    load_record.expand(record=built_records)


data_harvest_dag()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
import logging | ||
|
||
import requests | ||
from airflow.providers.http.hooks.http import HttpHook | ||
from hooks.tenacity_config import tenacity_retry_kwargs | ||
from requests import Response | ||
|
||
logger = logging.getLogger() | ||
|
||
|
||
class GenericHttpHook(HttpHook):
    """
    General-purpose HTTP hook built on Airflow's ``HttpHook``.

    It overrides the original `run` method in HttpHook so that the request
    body (``json`` / ``data``) and the query string (``params``) can be
    passed as separate arguments, and adds `call_api`, which sends the
    request through ``run_with_advanced_retry`` with the project's tenacity
    settings.
    """

    def __init__(self, http_conn_id, method="GET", headers=None):
        """
        :param http_conn_id: connection id forwarded to ``HttpHook``.
        :param method: HTTP method used when a call does not name one.
        :param headers: default headers for calls that pass none; may be None.
        """
        self._headers = headers
        super().__init__(method=method, http_conn_id=http_conn_id)

    @property
    def tenacity_retry_kwargs(self) -> dict:
        """Retry settings handed to ``run_with_advanced_retry`` by `call_api`."""
        return tenacity_retry_kwargs()

    @property
    def headers(self) -> dict:
        """Default request headers (None when none were configured)."""
        return self._headers

    @headers.setter
    def headers(self, headers):
        self._headers = headers

    def run(
        self,
        endpoint: str,
        method: str = None,
        json: dict = None,
        data: dict = None,
        params: dict = None,
        headers: dict = None,
        extra_options: dict = None,
    ):
        """
        Prepare and send a single request.

        :param endpoint: path joined onto the connection's base URL.
        :param method: HTTP method; falls back to the hook's ``method``.
        :param json: body passed to ``requests.Request`` as ``json``.
        :param data: body passed to ``requests.Request`` as ``data``.
        :param params: query-string parameters.
        :param headers: request headers; falls back to the hook's ``headers``.
        :param extra_options: forwarded unchanged to ``run_and_check``.
        :return: whatever ``run_and_check`` returns for the prepared request.
        """
        extra_options = extra_options or {}
        method = method or self.method
        headers = headers or self.headers
        session = self.get_conn(headers)

        base_url = self.base_url
        base_has_slash = base_url.endswith("/")
        endpoint_has_slash = endpoint.startswith("/")
        if base_has_slash and endpoint_has_slash:
            # Both sides bring a slash: keep only one so the URL does not
            # end up as "host//path".
            url = base_url + endpoint[1:]
        elif not base_has_slash and not endpoint_has_slash:
            url = base_url + "/" + endpoint
        else:
            url = base_url + endpoint

        req = requests.Request(
            method, url, json=json, data=data, params=params, headers=headers
        )

        prepped_request = session.prepare_request(req)
        self.log.info("Sending '%s' to url: %s", method, url)
        return self.run_and_check(session, prepped_request, extra_options)

    def call_api(
        self,
        method: str,
        endpoint: str,
        data: dict = None,
        params: dict = None,
        headers: dict = None,
    ) -> Response:
        """
        Send a request with retries configured by `tenacity_retry_kwargs`.

        :param method: HTTP method.
        :param endpoint: path joined onto the connection's base URL.
        :param data: request body; note that it is sent as the JSON body
            (it is forwarded as ``json=``, not ``data=``).
        :param params: query-string parameters.
        :param headers: request headers; the hook's defaults apply when omitted.
        :return: the response produced by `run`.
        """
        return self.run_with_advanced_retry(
            _retry_args=self.tenacity_retry_kwargs,
            endpoint=endpoint,
            headers=headers,
            json=data,
            params=params,
            method=method,
        )

    def get_url(self) -> str:
        """Return the connection's base URL."""
        # ``base_url`` is filled in by ``HttpHook.get_conn()``, so call it
        # first; the session it returns is not needed here.
        self.get_conn()
        return self.base_url
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
apache-airflow==2.9.3 | ||
inspire_utils==3.0.61 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
{ | ||
"backoffice_token": "2e04111a61e8f5ba6ecec52af21bbb9e81732085", | ||
"inspire_token": "CHANGE_ME", | ||
"author_schema": "https://inspirehep.net/schemas/records/authors.json" | ||
"author_schema": "https://inspirehep.net/schemas/records/authors.json", | ||
"data_schema": "https://inspirehep.net/schemas/records/data.json" | ||
} |
Oops, something went wrong.