Skip to content

Commit

Permalink
Merge pull request #2 from Lorenzovagliano/test-branch
Browse files Browse the repository at this point in the history
pre-commit formating changes
  • Loading branch information
Lorenzovagliano authored Oct 22, 2024
2 parents 3323081 + 0c18b8d commit 1dd37aa
Show file tree
Hide file tree
Showing 56 changed files with 440 additions and 442 deletions.
10 changes: 5 additions & 5 deletions dags/aps/aps_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
class APSParams:
def __init__(
self,
from_date= (date.today() - timedelta(days=1)).strftime("%Y-%m-%d"),
until_date= date.today().strftime("%Y-%m-%d"),
date= "modified",
journals= "",
set= "scoap3",
from_date=(date.today() - timedelta(days=1)).strftime("%Y-%m-%d"),
until_date=date.today().strftime("%Y-%m-%d"),
date="modified",
journals="",
set="scoap3",
per_page: int = 100,
):
self.from_date = from_date
Expand Down
6 changes: 3 additions & 3 deletions dags/aps/aps_pull_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
)
def aps_pull_api():
@task()
def set_fetching_intervals(repo = APSRepository(), **kwargs):
def set_fetching_intervals(repo=APSRepository(), **kwargs):
return set_harvesting_interval(repo=repo, **kwargs)

@task()
def save_json_in_s3(dates: dict, repo = APSRepository(), **kwargs):
def save_json_in_s3(dates: dict, repo=APSRepository(), **kwargs):
parameters = APSParams(
from_date=dates["from_date"],
until_date=dates["until_date"],
Expand All @@ -40,7 +40,7 @@ def save_json_in_s3(dates: dict, repo = APSRepository(), **kwargs):
return None

@task()
def trigger_files_processing(key, repo = APSRepository()):
def trigger_files_processing(key, repo=APSRepository()):
if key is None:
logging.warning("No new files were downloaded to s3")
return
Expand Down
13 changes: 7 additions & 6 deletions dags/aps/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,26 +97,27 @@ def _form_authors(self, article):
if author["type"] == "Person"
]


def extract_organization_and_ror(self, text):
pattern = r'<a href="([^"]+)">(.*?)</a>'

ror_url = None

def replace_and_capture(match):
nonlocal ror_url
ror_url = match.group(1)
return match.group(2)

modified_text = re.sub(pattern, replace_and_capture, text)

return modified_text, ror_url

def _get_affiliations(self, article, affiliationIds):
parsed_affiliations = [
{
"value": affiliation["name"],
"organization": self.extract_organization_and_ror(affiliation["name"])[0],
"organization": self.extract_organization_and_ror(affiliation["name"])[
0
],
"ror": self.extract_organization_and_ror(affiliation["name"])[1],
}
for affiliation in article["affiliations"]
Expand Down
1 change: 0 additions & 1 deletion dags/aps/repository.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import io
import os
from typing import IO

from common.repository import IRepository
from common.s3_service import S3Service
Expand Down
3 changes: 1 addition & 2 deletions dags/clean/cleanup_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import pendulum
from airflow.decorators import dag
from airflow.operators.bash import BashOperator
from airflow.operators.bash_operator import BashOperator

AIRFLOW_HOME = os.getenv("AIRFLOW_HOME")
Expand All @@ -14,7 +13,7 @@
def cleanup_logs():
BashOperator(
task_id="cleanup_logs",
bash_command=f"""
bash_command=rf"""
logs_dir="{logs_dir}"
find "$logs_dir" -type d -mtime +30 -exec rm -r {{}} \;
""",
Expand Down
2 changes: 1 addition & 1 deletion dags/common/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def replace_cdata_format(text):
CDATA_PATTERN = re.compile(r"<\?CDATA(.*)\?>")
# pattern = re.compile(r'<\?CDATA\s(.*?)\s\?>', re.DOTALL)

replaced_text = CDATA_PATTERN.sub(r'<![CDATA[ \1 ]]>', text)
replaced_text = CDATA_PATTERN.sub(r"<![CDATA[ \1 ]]>", text)

return replaced_text

Expand Down
42 changes: 22 additions & 20 deletions dags/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
("Brazil", "Brazil"),
("Brasil", "Brazil"),
("Benin", "Benin"),
(u"Bénin", "Benin"),
("Bénin", "Benin"),
("Bulgaria", "Bulgaria"),
("Bosnia and Herzegovina", "Bosnia and Herzegovina"),
("Canada", "Canada"),
Expand Down Expand Up @@ -141,7 +141,7 @@
("Luxembourg", "Luxembourg"),
("Macedonia", "Macedonia"),
("Mexico", "Mexico"),
(u"México", "Mexico"),
("México", "Mexico"),
("Monaco", "Monaco"),
("Montenegro", "Montenegro"),
("Morocco", "Morocco"),
Expand All @@ -161,7 +161,7 @@
("Portugalo", "Portugal"),
("Portugal", "Portugal"),
("P.R.China", "China"),
(u"People’s Republic of China", "China"),
("People’s Republic of China", "China"),
("Republic of Belarus", "Belarus"),
("Republic of Benin", "Benin"),
("Republic of Korea", "South Korea"),
Expand All @@ -181,7 +181,7 @@
("Slovenia", "Slovenia"),
("South Africa", "South Africa"),
("Africa", "South Africa"),
(u"España", "Spain"),
("España", "Spain"),
("Spain", "Spain"),
("Sudan", "Sudan"),
("Sweden", "Sweden"),
Expand Down Expand Up @@ -233,19 +233,21 @@
]
)

INSTITUTIONS_AND_COUNTRIES_MAPPING = OrderedDict([
("INFN", "Italy"),
("European Organization for Nuclear Research", "CERN"),
("Conseil Européen pour la Recherche Nucléaire", "CERN"),
("CERN", "CERN"),
("KEK", "Japan"),
("DESY", "Germany"),
("FERMILAB", "USA"),
("FNAL", "USA"),
("SLACK", "USA"),
("Stanford Linear Accelerator Center", "USA"),
("Joint Institute for Nuclear Research", "JINR"),
("JINR", "JINR"),
("ROC", "Taiwan"),
("R.O.C", "Taiwan"),
])
INSTITUTIONS_AND_COUNTRIES_MAPPING = OrderedDict(
[
("INFN", "Italy"),
("European Organization for Nuclear Research", "CERN"),
("Conseil Européen pour la Recherche Nucléaire", "CERN"),
("CERN", "CERN"),
("KEK", "Japan"),
("DESY", "Germany"),
("FERMILAB", "USA"),
("FNAL", "USA"),
("SLACK", "USA"),
("Stanford Linear Accelerator Center", "USA"),
("Joint Institute for Nuclear Research", "JINR"),
("JINR", "JINR"),
("ROC", "Taiwan"),
("R.O.C", "Taiwan"),
]
)
9 changes: 5 additions & 4 deletions dags/common/enhancer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re

from common.constants import FN_REGEX
from common.utils import parse_country_from_value, get_country_ISO_name
from common.utils import get_country_ISO_name, parse_country_from_value


class Enhancer:
Expand Down Expand Up @@ -47,7 +47,7 @@ def __construct_titles(self, item, publisher):
def __construct_authors(self, item):
# add_nations(item)
pattern_for_cern_cooperation_agreement = re.compile(
r'cooperation agreement with cern', re.IGNORECASE
r"cooperation agreement with cern", re.IGNORECASE
)
for author in item.get("authors", []):
for affiliation in author.get("affiliations", []):
Expand All @@ -65,11 +65,12 @@ def __construct_authors(self, item):
affiliation["country"] = _parsed_country

if affiliation.get("country"):
affiliation["country"] = get_country_ISO_name(affiliation["country"])
affiliation["country"] = get_country_ISO_name(
affiliation["country"]
)

return item


def __call__(self, publisher, item):
creation_date = datetime.datetime.now().isoformat()
item_copy = item.copy()
Expand Down
8 changes: 4 additions & 4 deletions dags/common/parsing/xml_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def __init__(
extra_function=lambda s: s,
prefixes=None,
all_content_between_tags=False,
remove_tags=False
remove_tags=False,
):
super().__init__(destination)

Expand Down Expand Up @@ -92,7 +92,7 @@ def __init__(
default_value=None,
required=False,
extra_function=lambda x: x,
) :
):
super().__init__(destination)
self.destination = destination
self.source = source
Expand Down Expand Up @@ -132,7 +132,7 @@ def extract(self, article):
class CustomExtractor(IExtractor):
def __init__(
self, destination, extraction_function, required=False, default_value=None
) :
):
super().__init__(destination)
self.destination = destination
self.extraction_function = extraction_function
Expand All @@ -154,7 +154,7 @@ def __init__(
destination,
value,
required=False,
) :
):
super().__init__(destination)
self.destination = destination
self.required = required
Expand Down
16 changes: 8 additions & 8 deletions dags/common/scoap3_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,19 @@

logger = get_logger()

FILE_EXTENSIONS = {
"pdf": ".pdf",
"xml": ".xml",
"pdfa": ".pdf"
}
FILE_EXTENSIONS = {"pdf": ".pdf", "xml": ".xml", "pdfa": ".pdf"}


def update_filename_extension(filename, type):
extension = FILE_EXTENSIONS.get(type, "")
if filename.endswith(extension):
return filename
elif extension:
if type == "pdfa":
extension = f".a-2b.pdf"
extension = ".a-2b.pdf"
return f"{filename}{extension}"


class Scoap3Repository(IRepository):
def __init__(self):
super().__init__()
Expand Down Expand Up @@ -55,7 +53,7 @@ def copy_file(self, source_bucket, source_key, prefix=None, type=None):
"source_key": source_key,
},
"MetadataDirective": "REPLACE",
"ACL": "public-read"
"ACL": "public-read",
},
)
logger.info(
Expand All @@ -67,7 +65,9 @@ def copy_files(self, bucket, files, prefix=None):
copied_files = {}
for type, path in files.items():
try:
copied_files[type] = self.copy_file(bucket, path, prefix=prefix, type=type)
copied_files[type] = self.copy_file(
bucket, path, prefix=prefix, type=type
)
except Exception as e:
logger.error("Failed to copy file.", error=str(e), type=type, path=path)
return copied_files
Expand Down
18 changes: 10 additions & 8 deletions dags/common/utils.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
from datetime import date, datetime
import io
import json
import os
import re
import tarfile
import xml.etree.ElementTree as ET
import zipfile
from datetime import date, datetime
from ftplib import error_perm
from io import StringIO
from stat import S_ISDIR, S_ISREG
from inspire_utils.record import get_value

import backoff
import country_converter as coco
Expand All @@ -20,7 +19,6 @@
BY_PATTERN,
CDATA_PATTERN,
COUNTRIES_DEFAULT_MAPPING,
COUNTRY_PARSING_PATTERN,
CREATIVE_COMMONS_PATTERN,
INSTITUTIONS_AND_COUNTRIES_MAPPING,
LICENSE_PATTERN,
Expand All @@ -30,11 +28,13 @@
UnknownFileExtension,
UnknownLicense,
)
from inspire_utils.record import get_value
from structlog import get_logger

logger = get_logger()
cc = coco.CountryConverter()


def set_harvesting_interval(repo, **kwargs):
if (
"params" in kwargs
Expand Down Expand Up @@ -268,7 +268,7 @@ def iterate_element(item):
iterate_element(item)

title_part = [i for i in title_parts if i]
full_text = ' '.join(title_part).strip()
full_text = " ".join(title_part).strip()

return full_text

Expand Down Expand Up @@ -311,10 +311,12 @@ def parse_country_from_value(affiliation_value):
country_code = cc.convert(country, to="iso2")
mapped_countries = []
if country_code != "not found":
mapped_countries = [{
"code": country_code,
"name": cc.convert(country, to="name_short"),
}]
mapped_countries = [
{
"code": country_code,
"name": cc.convert(country, to="name_short"),
}
]

if len(mapped_countries) > 1 or len(mapped_countries) == 0:
raise FoundMoreThanOneMatchOrNone(affiliation_value)
Expand Down
6 changes: 2 additions & 4 deletions dags/elsevier/elsevier_pull_sftp.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@ def elsevier_pull_sftp():

@task(executor_config=kubernetes_executor_config)
def migrate_from_ftp(
sftp = ElsevierSFTPService(),
repo = ElsevierRepository(),
**kwargs
sftp=ElsevierSFTPService(), repo=ElsevierRepository(), **kwargs
):
params = kwargs["params"]
specific_files = (
Expand All @@ -44,7 +42,7 @@ def migrate_from_ftp(

@task(executor_config=kubernetes_executor_config)
def trigger_file_processing(
repo = ElsevierRepository(),
repo=ElsevierRepository(),
filenames=None,
):
return trigger_file_processing_elsevier(
Expand Down
4 changes: 2 additions & 2 deletions dags/executor_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
resources=k8s.V1ResourceRequirements(
requests={"memory": "1500Mi"},
limits={"memory": "2Gi"},
)
),
)
],
)
),
}
}
Loading

0 comments on commit 1dd37aa

Please sign in to comment.