From 150dd5d945b11636b6f705a9b52ccdf67de5a5d5 Mon Sep 17 00:00:00 2001 From: ErnestaP <ernesta.petraityte@yahoo.com> Date: Wed, 10 Jul 2024 13:55:21 +0200 Subject: [PATCH] HindawiAPIClient:added curl headers --- dags/hindawi/hindawi_api_client.py | 5 +++++ dags/hindawi/hindawi_pull_api.py | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/dags/hindawi/hindawi_api_client.py b/dags/hindawi/hindawi_api_client.py index d5de2f8d..67003043 100644 --- a/dags/hindawi/hindawi_api_client.py +++ b/dags/hindawi/hindawi_api_client.py @@ -19,6 +19,10 @@ def __init__( "HINDAWI_API_FILES_URL", "http://downloads.hindawi.com" ) self.logger = get_logger().bind(class_name=type(self).__name__) + self.headers = { + "Accept": "application/xml", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0" + } def get_articles_metadata(self, parameters, doi=None): path_segments = ["oai-pmh", "oai.aspx"] @@ -27,6 +31,7 @@ def get_articles_metadata(self, parameters, doi=None): path_segments.append(doi) request = Request( base_url=self.base_url, + headers=self.headers, path_segments=path_segments, parameters=parameters, ) diff --git a/dags/hindawi/hindawi_pull_api.py b/dags/hindawi/hindawi_pull_api.py index 84b7a22f..75ee10f2 100644 --- a/dags/hindawi/hindawi_pull_api.py +++ b/dags/hindawi/hindawi_pull_api.py @@ -17,17 +17,17 @@ ) def hindawi_pull_api(): @task() - def set_fetching_intervals(repo= HindawiRepository(), **kwargs): + def set_fetching_intervals(repo=HindawiRepository(), **kwargs): return set_harvesting_interval(repo=repo, **kwargs) @task() - def save_xml_in_s3(dates: dict, repo= HindawiRepository(), **kwargs): + def save_xml_in_s3(dates: dict, repo=HindawiRepository(), **kwargs): record = kwargs["params"]["record_doi"] parameters = HindawiParams( from_date=dates["from_date"], until_date=dates["until_date"], record=record ).get_params() rest_api = HindawiApiClient( - base_url=os.getenv("HINDAWI_API_BASE_URL", "https://www.hindawi.com") + base_url=os.getenv("HINDAWI_API_BASE_URL", "https://oaipmh.hindawi.com") ) articles_metadata = rest_api.get_articles_metadata(parameters) if not articles_metadata: @@ -36,7 +36,7 @@ def save_xml_in_s3(dates: dict, repo= HindawiRepository(), **kwargs): return save_file_in_s3(data=articles_metadata, repo=repo) @task() - def trigger_files_processing(key, repo= HindawiRepository()): + def trigger_files_processing(key, repo=HindawiRepository()): if not key: logging.warning("No new files were downloaded to s3") return