From 447494266ee251b686dc8689c1c453bdc2cea01b Mon Sep 17 00:00:00 2001 From: david-i-berry Date: Fri, 6 Dec 2024 10:54:37 +0100 Subject: [PATCH 1/4] Addresses: - #32 (http 200 check) - #30 (integrity field) - #27 (hash function) Fixes path issues in Dockerfile. --- docker/Dockerfile | 11 ++++++----- wis2downloader/downloader/__init__.py | 28 ++++++++++++++++++++------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 18c1392..d296d81 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -44,10 +44,10 @@ RUN source /home/wis2downloader/.venv/bin/activate && \ USER root # Now copy files COPY . /home/wis2downloader/tmp -COPY ./docker/config/. /home/wis2downloader/app/config -COPY ./docker/entrypoint.sh /home/wis2downloader/app/entrypoint.sh -COPY ./docker/clean_downloads.cron /home/wis2downloader/app/clean_downloads.cron -COPY ./docker/clean_downloads.py /home/wis2downloader/app/clean_downloads.py +COPY config/. /home/wis2downloader/app/config +COPY entrypoint.sh /home/wis2downloader/app/entrypoint.sh +COPY clean_downloads.cron /home/wis2downloader/app/clean_downloads.cron +COPY clean_downloads.py /home/wis2downloader/app/clean_downloads.py # set ownership / permisssions RUN chown -R wis2downloader /home/wis2downloader/tmp && \ @@ -59,7 +59,8 @@ RUN chown -R wis2downloader /home/wis2downloader/tmp && \ USER wis2downloader WORKDIR /home/wis2downloader/tmp RUN source /home/wis2downloader/.venv/bin/activate && \ - python -m pip install --no-cache-dir . + python -m pip install wis2downloader + # clean up \ WORKDIR /home/wis2downloader/ RUN rm -R /home/wis2downloader/tmp diff --git a/wis2downloader/downloader/__init__.py b/wis2downloader/downloader/__init__.py index a05717b..8dfb680 100644 --- a/wis2downloader/downloader/__init__.py +++ b/wis2downloader/downloader/__init__.py @@ -135,10 +135,9 @@ def process_job(self, job) -> None: # Get information about the job for verification later expected_hash, hash_function = self.get_hash_info(job) - expected_size = job.get('payload', {}).get('content', {}).get('size') # Get the download url, update status, and file type from the job links - _url, update, media_type = self.get_download_url(job) + _url, update, media_type, expected_size = self.get_download_url(job) if _url is None: LOGGER.warning(f"No download link found in job {job}") @@ -182,6 +181,12 @@ def process_job(self, job) -> None: response = None try: response = self.http.request('GET', _url) + if response.status != 200: + LOGGER.error(f"Error fetching file from {_url}.") + LOGGER.error(f".... Status code: {response.status}") + LOGGER.error(f".... 
Content: {response.data}") + FAILED_DOWNLOADS.labels(topic=topic, centre_id=centre_id).inc(1) + return # Get the filesize in KB filesize = len(response.data) except Exception as e: @@ -230,7 +235,7 @@ def get_topic_and_centre(self, job) -> tuple: def get_hash_info(self, job): expected_hash = job.get('payload', {}).get( - 'properties', {}).get('integrity', {}).get('hash') + 'properties', {}).get('integrity', {}).get('value') hash_method = job.get('payload', {}).get( 'properties', {}).get('integrity', {}).get('method') @@ -238,8 +243,10 @@ def get_hash_info(self, job): # Check if hash method is known using our enumumeration of hash methods if hash_method in VerificationMethods._member_names_: + # get method method = VerificationMethods[hash_method].value - hash_function = hashlib.new(method) + # load and return from the hashlib library + hash_function = getattr(hashlib, method, None) return expected_hash, hash_function @@ -248,18 +255,21 @@ def get_download_url(self, job) -> tuple: _url = None update = False media_type = None + expected_size = None for link in links: if link.get('rel') == 'update': _url = link.get('href') media_type = link.get('type') + expected_size = link.get('length') update = True break elif link.get('rel') == 'canonical': _url = link.get('href') media_type = link.get('type') + expected_size = link.get('length') break - return _url, update, media_type + return _url, update, media_type, expected_size def extract_filename(self, _url) -> tuple: path = urlsplit(_url).path @@ -273,8 +283,12 @@ def validate_data(self, data, expected_hash, hash_function): return True - hash_value = hash_function(data).digest() - hash_value = base64.b64encode(hash_value).decode() + try: + hash_value = hash_function(data).digest() + hash_value = base64.b64encode(hash_value).decode() + except Exception as e: + LOGGER.error(e) + return False if (hash_value != expected_hash) or (len(data) != expected_size): return False From 1b0167c434ed2fc32963da08a9a4db7557b6916c Mon Sep 17 00:00:00 2001 From: david-i-berry Date: Fri, 6 Dec 2024 10:58:52 +0100 Subject: [PATCH 2/4] Alignment with recent PR to address #32. --- wis2downloader/downloader/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/wis2downloader/downloader/__init__.py b/wis2downloader/downloader/__init__.py index 8dfb680..be9ef5a 100644 --- a/wis2downloader/downloader/__init__.py +++ b/wis2downloader/downloader/__init__.py @@ -182,9 +182,8 @@ def process_job(self, job) -> None: try: response = self.http.request('GET', _url) if response.status != 200: - LOGGER.error(f"Error fetching file from {_url}.") - LOGGER.error(f".... Status code: {response.status}") - LOGGER.error(f".... Content: {response.data}") + LOGGER.error(f"Error downloading {_url}, received status code: {response.status}") + # Increment failed download counter FAILED_DOWNLOADS.labels(topic=topic, centre_id=centre_id).inc(1) return # Get the filesize in KB From 95efd0c593c243db08918f7ef6b4e01a2deede37 Mon Sep 17 00:00:00 2001 From: david-i-berry Date: Fri, 6 Dec 2024 10:54:37 +0100 Subject: [PATCH 3/4] Addresses: - #30 (integrity field) - #27 (hash function) Fixes path issues in Dockerfile. 
--- docker/Dockerfile | 11 ++++++----- wis2downloader/downloader/__init__.py | 22 +++++++++++++++------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 18c1392..d296d81 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -44,10 +44,10 @@ RUN source /home/wis2downloader/.venv/bin/activate && \ USER root # Now copy files COPY . /home/wis2downloader/tmp -COPY ./docker/config/. /home/wis2downloader/app/config -COPY ./docker/entrypoint.sh /home/wis2downloader/app/entrypoint.sh -COPY ./docker/clean_downloads.cron /home/wis2downloader/app/clean_downloads.cron -COPY ./docker/clean_downloads.py /home/wis2downloader/app/clean_downloads.py +COPY config/. /home/wis2downloader/app/config +COPY entrypoint.sh /home/wis2downloader/app/entrypoint.sh +COPY clean_downloads.cron /home/wis2downloader/app/clean_downloads.cron +COPY clean_downloads.py /home/wis2downloader/app/clean_downloads.py # set ownership / permisssions RUN chown -R wis2downloader /home/wis2downloader/tmp && \ @@ -59,7 +59,8 @@ RUN chown -R wis2downloader /home/wis2downloader/tmp && \ USER wis2downloader WORKDIR /home/wis2downloader/tmp RUN source /home/wis2downloader/.venv/bin/activate && \ - python -m pip install --no-cache-dir . + python -m pip install wis2downloader + # clean up \ WORKDIR /home/wis2downloader/ RUN rm -R /home/wis2downloader/tmp diff --git a/wis2downloader/downloader/__init__.py b/wis2downloader/downloader/__init__.py index 9b4a881..e5ba823 100644 --- a/wis2downloader/downloader/__init__.py +++ b/wis2downloader/downloader/__init__.py @@ -135,10 +135,9 @@ def process_job(self, job) -> None: # Get information about the job for verification later expected_hash, hash_function = self.get_hash_info(job) - expected_size = job.get('payload', {}).get('content', {}).get('size') # Get the download url, update status, and file type from the job links - _url, update, media_type = self.get_download_url(job) + _url, update, media_type, expected_size = self.get_download_url(job) if _url is None: LOGGER.warning(f"No download link found in job {job}") @@ -236,7 +235,7 @@ def get_topic_and_centre(self, job) -> tuple: def get_hash_info(self, job): expected_hash = job.get('payload', {}).get( - 'properties', {}).get('integrity', {}).get('hash') + 'properties', {}).get('integrity', {}).get('value') hash_method = job.get('payload', {}).get( 'properties', {}).get('integrity', {}).get('method') @@ -244,8 +243,10 @@ def get_hash_info(self, job): # Check if hash method is known using our enumumeration of hash methods if hash_method in VerificationMethods._member_names_: + # get method method = VerificationMethods[hash_method].value - hash_function = hashlib.new(method) + # load and return from the hashlib library + hash_function = getattr(hashlib, method, None) return expected_hash, hash_function @@ -254,18 +255,21 @@ def get_download_url(self, job) -> tuple: _url = None update = False media_type = None + expected_size = None for link in links: if link.get('rel') == 'update': _url = link.get('href') media_type = link.get('type') + expected_size = link.get('length') update = True break elif link.get('rel') == 'canonical': _url = link.get('href') media_type = link.get('type') + expected_size = link.get('length') break - return _url, update, media_type + return _url, update, media_type, expected_size def extract_filename(self, _url) -> tuple: path = urlsplit(_url).path @@ -279,8 +283,12 @@ def validate_data(self, data, expected_hash, hash_function): return True - hash_value = 
hash_function(data).digest() - hash_value = base64.b64encode(hash_value).decode() + try: + hash_value = hash_function(data).digest() + hash_value = base64.b64encode(hash_value).decode() + except Exception as e: + LOGGER.error(e) + return False if (hash_value != expected_hash) or (len(data) != expected_size): return False From da931741265ca5121b57d8123422ce6968a12171 Mon Sep 17 00:00:00 2001 From: david-i-berry Date: Fri, 6 Dec 2024 13:35:30 +0100 Subject: [PATCH 4/4] tests updated so reproducible download. --- .github/workflows/test-docker.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-docker.yml b/.github/workflows/test-docker.yml index df6c62f..2f66015 100644 --- a/.github/workflows/test-docker.yml +++ b/.github/workflows/test-docker.yml @@ -53,6 +53,8 @@ jobs: echo "Testing removing subscription" # test deleting subscriptions docker exec subscriber bash -c "source /home/wis2downloader/.venv/bin/activate && wis2downloader remove-subscription --topic cache/a/wis2/+/services/#" + # clean up, remove test download + docker exec subscriber bash -c "rm \"./app/data/downloads/$(date +'%Y')/$(date +'%m')/$(date +'%d')/cache/a/wis2/my-centre/services/downloader/openapi.bin\"" - name: Run API tests working-directory: docker/tests run: | @@ -71,12 +73,14 @@ jobs: # publish a test message docker exec publisher pywis-pubsub publish --topic cache/a/wis2/my-centre/services/downloader \ --config /pywis-pubsub/config/config.yml \ - -i test -u "http://subscriber:5000/metrics" + -i test -u "http://subscriber:5000/openapi" sleep 1s # cat file contents (check the published file has been downloaded) - cat "./data/$(date +'%Y')/$(date +'%m')/$(date +'%d')/cache/a/wis2/my-centre/services/downloader/metrics.bin" + cat "./data/$(date +'%Y')/$(date +'%m')/$(date +'%d')/cache/a/wis2/my-centre/services/downloader/openapi.bin" # test deleting subscriptions curl -X DELETE http://localhost:5000/subscriptions/cache/a/wis2/%2B/services/%23 + # clean up, remove test download + docker exec subscriber bash -c "rm \"./app/data/downloads/$(date +'%Y')/$(date +'%m')/$(date +'%d')/cache/a/wis2/my-centre/services/downloader/openapi.bin\"" - name: Shutdown working-directory: docker/tests run: |
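
Taken together, the downloader changes above leave the per-job fetch path looking
roughly like the sketch below. Assumptions: links sit under job['payload']['links']
as in the WIS2 notification messages used by the tests, and urllib3 plus the module
logger are set up as in wis2downloader; metrics, filesystem handling and the hash
verification step (see patch 3) are elided.

    import logging
    import urllib3

    LOGGER = logging.getLogger(__name__)
    http = urllib3.PoolManager()

    def get_download_url(job) -> tuple:
        # Prefer an 'update' link over the 'canonical' one; the link's advertised
        # 'length' now travels with the URL as the expected size.
        _url, update, media_type, expected_size = None, False, None, None
        links = job.get('payload', {}).get('links', [])  # assumed location of links
        for link in links:
            if link.get('rel') == 'update':
                _url = link.get('href')
                media_type = link.get('type')
                expected_size = link.get('length')
                update = True
                break
            if link.get('rel') == 'canonical':
                _url = link.get('href')
                media_type = link.get('type')
                expected_size = link.get('length')
                break
        return _url, update, media_type, expected_size

    def fetch(job):
        _url, update, media_type, expected_size = get_download_url(job)
        if _url is None:
            LOGGER.warning("No download link found in job")
            return None
        response = http.request('GET', _url)
        if response.status != 200:
            # the downloader also increments its failed-download counter here (elided)
            LOGGER.error(f"Error downloading {_url}, received status code: {response.status}")
            return None
        if expected_size is not None and len(response.data) != expected_size:
            LOGGER.error(f"Size mismatch for {_url}: got {len(response.data)} bytes, expected {expected_size}")
            return None
        return response.data

Carrying expected_size along with the URL means the size advertised in the matching
link is the one checked, rather than the top-level payload content size field that
the previous code read.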