Skip to content

Commit

Permalink
Global: Changes caused by linting
Browse files Browse the repository at this point in the history
Signed-off-by: Lorenzo Vagliano
  • Loading branch information
Lorenzovagliano committed Oct 25, 2024
1 parent 5cae23f commit dfaf38e
Show file tree
Hide file tree
Showing 63 changed files with 554 additions and 449 deletions.
26 changes: 26 additions & 0 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
name: Pre-Commit

on:
workflow_call:
inputs:
ref:
description: The reference to build
type: string
required: true

jobs:
linter:
runs-on: ubuntu-latest
steps:
- name: Checkout Code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.10.11"

- name: Run pre-commit
uses: pre-commit/[email protected]
19 changes: 19 additions & 0 deletions .github/workflows/pull-request-main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
name: Pull request main

on:
pull_request_target:
branches: [main]
paths-ignore: ["docs/**"]

jobs:
test:
uses: ./.github/workflows/test-and-build.yml
with:
ref: ${{ github.event.pull_request.head.sha }}
secrets: inherit

pre-commit:
uses: ./.github/workflows/pre-commit.yml
with:
ref: ${{ github.event.pull_request.head.sha }}
secrets: inherit
12 changes: 6 additions & 6 deletions .github/workflows/test-and-build.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
name: Build and Test
on:
push:
branches:
- main
pull_request_target:
branches:
- main
workflow_call:
inputs:
ref:
description: The reference to build
type: string
required: true

env:
AIRFLOW_HOME: /home/runner/work/workflows/workflows
Expand Down
12 changes: 12 additions & 0 deletions .gitlint
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[general]
ignore=body-is-missing
extra-path=./scripts/gitlint_rules/
debug=true
verbosity=3
contrib = contrib-body-requires-signed-off-by, contrib-title-conventional-commits

[title-max-length]
line-length=50

[body-max-line-length]
line-length=72
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ repos:
rev: "v2.7.1"
hooks:
- id: prettier
stages: [commit]
- repo: https://github.com/pycqa/isort
rev: "5.12.0"
hooks:
Expand All @@ -22,3 +23,11 @@ repos:
- id: check-merge-conflict
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/jorisroovers/gitlint
rev: "v0.17.0"
hooks:
- id: gitlint
language: python
entry: gitlint
stages: [commit-msg]
args: [--msg-filename]
10 changes: 5 additions & 5 deletions dags/aps/aps_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
class APSParams:
def __init__(
self,
from_date= (date.today() - timedelta(days=1)).strftime("%Y-%m-%d"),
until_date= date.today().strftime("%Y-%m-%d"),
date= "modified",
journals= "",
set= "scoap3",
from_date=(date.today() - timedelta(days=1)).strftime("%Y-%m-%d"),
until_date=date.today().strftime("%Y-%m-%d"),
date="modified",
journals="",
set="scoap3",
per_page: int = 100,
):
self.from_date = from_date
Expand Down
6 changes: 3 additions & 3 deletions dags/aps/aps_pull_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
)
def aps_pull_api():
@task()
def set_fetching_intervals(repo = APSRepository(), **kwargs):
def set_fetching_intervals(repo=APSRepository(), **kwargs):
return set_harvesting_interval(repo=repo, **kwargs)

@task()
def save_json_in_s3(dates: dict, repo = APSRepository(), **kwargs):
def save_json_in_s3(dates: dict, repo=APSRepository(), **kwargs):
parameters = APSParams(
from_date=dates["from_date"],
until_date=dates["until_date"],
Expand All @@ -40,7 +40,7 @@ def save_json_in_s3(dates: dict, repo = APSRepository(), **kwargs):
return None

@task()
def trigger_files_processing(key, repo = APSRepository()):
def trigger_files_processing(key, repo=APSRepository()):
if key is None:
logging.warning("No new files were downloaded to s3")
return
Expand Down
13 changes: 7 additions & 6 deletions dags/aps/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,26 +97,27 @@ def _form_authors(self, article):
if author["type"] == "Person"
]


def extract_organization_and_ror(self, text):
pattern = r'<a href="([^"]+)">(.*?)</a>'

ror_url = None

def replace_and_capture(match):
nonlocal ror_url
ror_url = match.group(1)
return match.group(2)

modified_text = re.sub(pattern, replace_and_capture, text)

return modified_text, ror_url

def _get_affiliations(self, article, affiliationIds):
parsed_affiliations = [
{
"value": affiliation["name"],
"organization": self.extract_organization_and_ror(affiliation["name"])[0],
"organization": self.extract_organization_and_ror(affiliation["name"])[
0
],
"ror": self.extract_organization_and_ror(affiliation["name"])[1],
}
for affiliation in article["affiliations"]
Expand Down
1 change: 0 additions & 1 deletion dags/aps/repository.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import io
import os
from typing import IO

from common.repository import IRepository
from common.s3_service import S3Service
Expand Down
3 changes: 1 addition & 2 deletions dags/clean/cleanup_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import pendulum
from airflow.decorators import dag
from airflow.operators.bash import BashOperator
from airflow.operators.bash_operator import BashOperator

AIRFLOW_HOME = os.getenv("AIRFLOW_HOME")
Expand All @@ -17,7 +16,7 @@ def cleanup_logs():
bash_command=f"""
logs_dir="{logs_dir}"
find "$logs_dir" -type d -mtime +30 -exec rm -r {{}} \;
""",
""", # noqa
)


Expand Down
2 changes: 1 addition & 1 deletion dags/common/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def replace_cdata_format(text):
CDATA_PATTERN = re.compile(r"<\?CDATA(.*)\?>")
# pattern = re.compile(r'<\?CDATA\s(.*?)\s\?>', re.DOTALL)

replaced_text = CDATA_PATTERN.sub(r'<![CDATA[ \1 ]]>', text)
replaced_text = CDATA_PATTERN.sub(r"<![CDATA[ \1 ]]>", text)

return replaced_text

Expand Down
42 changes: 22 additions & 20 deletions dags/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
("Brazil", "Brazil"),
("Brasil", "Brazil"),
("Benin", "Benin"),
(u"Bénin", "Benin"),
("Bénin", "Benin"),
("Bulgaria", "Bulgaria"),
("Bosnia and Herzegovina", "Bosnia and Herzegovina"),
("Canada", "Canada"),
Expand Down Expand Up @@ -141,7 +141,7 @@
("Luxembourg", "Luxembourg"),
("Macedonia", "Macedonia"),
("Mexico", "Mexico"),
(u"México", "Mexico"),
("México", "Mexico"),
("Monaco", "Monaco"),
("Montenegro", "Montenegro"),
("Morocco", "Morocco"),
Expand All @@ -161,7 +161,7 @@
("Portugalo", "Portugal"),
("Portugal", "Portugal"),
("P.R.China", "China"),
(u"People’s Republic of China", "China"),
("People’s Republic of China", "China"),
("Republic of Belarus", "Belarus"),
("Republic of Benin", "Benin"),
("Republic of Korea", "South Korea"),
Expand All @@ -181,7 +181,7 @@
("Slovenia", "Slovenia"),
("South Africa", "South Africa"),
("Africa", "South Africa"),
(u"España", "Spain"),
("España", "Spain"),
("Spain", "Spain"),
("Sudan", "Sudan"),
("Sweden", "Sweden"),
Expand Down Expand Up @@ -233,19 +233,21 @@
]
)

INSTITUTIONS_AND_COUNTRIES_MAPPING = OrderedDict([
("INFN", "Italy"),
("European Organization for Nuclear Research", "CERN"),
("Conseil Européen pour la Recherche Nucléaire", "CERN"),
("CERN", "CERN"),
("KEK", "Japan"),
("DESY", "Germany"),
("FERMILAB", "USA"),
("FNAL", "USA"),
("SLACK", "USA"),
("Stanford Linear Accelerator Center", "USA"),
("Joint Institute for Nuclear Research", "JINR"),
("JINR", "JINR"),
("ROC", "Taiwan"),
("R.O.C", "Taiwan"),
])
INSTITUTIONS_AND_COUNTRIES_MAPPING = OrderedDict(
[
("INFN", "Italy"),
("European Organization for Nuclear Research", "CERN"),
("Conseil Européen pour la Recherche Nucléaire", "CERN"),
("CERN", "CERN"),
("KEK", "Japan"),
("DESY", "Germany"),
("FERMILAB", "USA"),
("FNAL", "USA"),
("SLACK", "USA"),
("Stanford Linear Accelerator Center", "USA"),
("Joint Institute for Nuclear Research", "JINR"),
("JINR", "JINR"),
("ROC", "Taiwan"),
("R.O.C", "Taiwan"),
]
)
9 changes: 5 additions & 4 deletions dags/common/enhancer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re

from common.constants import FN_REGEX
from common.utils import parse_country_from_value, get_country_ISO_name
from common.utils import get_country_ISO_name, parse_country_from_value


class Enhancer:
Expand Down Expand Up @@ -47,7 +47,7 @@ def __construct_titles(self, item, publisher):
def __construct_authors(self, item):
# add_nations(item)
pattern_for_cern_cooperation_agreement = re.compile(
r'cooperation agreement with cern', re.IGNORECASE
r"cooperation agreement with cern", re.IGNORECASE
)
for author in item.get("authors", []):
for affiliation in author.get("affiliations", []):
Expand All @@ -65,11 +65,12 @@ def __construct_authors(self, item):
affiliation["country"] = _parsed_country

if affiliation.get("country"):
affiliation["country"] = get_country_ISO_name(affiliation["country"])
affiliation["country"] = get_country_ISO_name(
affiliation["country"]
)

return item


def __call__(self, publisher, item):
creation_date = datetime.datetime.now().isoformat()
item_copy = item.copy()
Expand Down
8 changes: 4 additions & 4 deletions dags/common/parsing/xml_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def __init__(
extra_function=lambda s: s,
prefixes=None,
all_content_between_tags=False,
remove_tags=False
remove_tags=False,
):
super().__init__(destination)

Expand Down Expand Up @@ -92,7 +92,7 @@ def __init__(
default_value=None,
required=False,
extra_function=lambda x: x,
) :
):
super().__init__(destination)
self.destination = destination
self.source = source
Expand Down Expand Up @@ -132,7 +132,7 @@ def extract(self, article):
class CustomExtractor(IExtractor):
def __init__(
self, destination, extraction_function, required=False, default_value=None
) :
):
super().__init__(destination)
self.destination = destination
self.extraction_function = extraction_function
Expand All @@ -154,7 +154,7 @@ def __init__(
destination,
value,
required=False,
) :
):
super().__init__(destination)
self.destination = destination
self.required = required
Expand Down
16 changes: 8 additions & 8 deletions dags/common/scoap3_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,19 @@

logger = get_logger()

FILE_EXTENSIONS = {
"pdf": ".pdf",
"xml": ".xml",
"pdfa": ".pdf"
}
FILE_EXTENSIONS = {"pdf": ".pdf", "xml": ".xml", "pdfa": ".pdf"}


def update_filename_extension(filename, type):
extension = FILE_EXTENSIONS.get(type, "")
if filename.endswith(extension):
return filename
elif extension:
if type == "pdfa":
extension = f".a-2b.pdf"
extension = ".a-2b.pdf"
return f"{filename}{extension}"


class Scoap3Repository(IRepository):
def __init__(self):
super().__init__()
Expand Down Expand Up @@ -55,7 +53,7 @@ def copy_file(self, source_bucket, source_key, prefix=None, type=None):
"source_key": source_key,
},
"MetadataDirective": "REPLACE",
"ACL": "public-read"
"ACL": "public-read",
},
)
logger.info(
Expand All @@ -67,7 +65,9 @@ def copy_files(self, bucket, files, prefix=None):
copied_files = {}
for type, path in files.items():
try:
copied_files[type] = self.copy_file(bucket, path, prefix=prefix, type=type)
copied_files[type] = self.copy_file(
bucket, path, prefix=prefix, type=type
)
except Exception as e:
logger.error("Failed to copy file.", error=str(e), type=type, path=path)
return copied_files
Expand Down
Loading

0 comments on commit dfaf38e

Please sign in to comment.