From 34be11c76ecd13544df3b4393adae3b7919ead18 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 10 Jun 2024 17:21:21 +0200 Subject: [PATCH] Format with ruff, fix mypy errors, update dev_dependencies --- preprocessing/nextclade/dev_dependencies.txt | 2 ++ .../src/loculus_preprocessing/backend.py | 10 +++++----- .../src/loculus_preprocessing/config.py | 2 +- .../src/loculus_preprocessing/prepro.py | 20 ++++++++++--------- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/preprocessing/nextclade/dev_dependencies.txt b/preprocessing/nextclade/dev_dependencies.txt index c532e08f3..fc02661e0 100644 --- a/preprocessing/nextclade/dev_dependencies.txt +++ b/preprocessing/nextclade/dev_dependencies.txt @@ -2,3 +2,5 @@ mypy ruff types-PyYAML types-requests +types-pytz +types-python-dateutil \ No newline at end of file diff --git a/preprocessing/nextclade/src/loculus_preprocessing/backend.py b/preprocessing/nextclade/src/loculus_preprocessing/backend.py index efd02b5ef..8a6028b56 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/backend.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/backend.py @@ -1,17 +1,17 @@ """Functions to interface with the backend""" +import dataclasses import datetime as dt +import json import logging +import time +from collections.abc import Sequence from http import HTTPStatus -import dataclasses -import json from pathlib import Path import jwt import pytz import requests -import time -from collections.abc import Sequence from .config import Config from .datatypes import ( @@ -76,7 +76,7 @@ def fetch_unprocessed_sequences(n: int, config: Config) -> str: if response.status_code == HTTPStatus.UNPROCESSABLE_ENTITY: logging.debug(f"{response.text}.\nSleeping for a while.") time.sleep(60 * 1) - return [] + return "" msg = f"Fetching unprocessed data failed. Status code: { response.status_code}" raise Exception( diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py index 0c3647236..b32ea1f78 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/config.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/config.py @@ -18,7 +18,7 @@ @dataclass class Config: organism: str = "mpox" - backend_host: str = None # Set default to None or similar placeholder + backend_host: str = "" keycloak_host: str = "http://127.0.0.1:8083" keycloak_user: str = "preprocessing_pipeline" keycloak_password: str = "preprocessing_pipeline" diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index e53f44a66..d71c61164 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -21,7 +21,6 @@ AnnotationSource, AnnotationSourceType, GeneName, - SegmentName, InputMetadata, NucleotideInsertion, NucleotideSequence, @@ -31,6 +30,7 @@ ProcessingAnnotation, ProcessingResult, ProcessingSpec, + SegmentName, UnprocessedAfterNextclade, UnprocessedData, UnprocessedEntry, @@ -195,7 +195,10 @@ def enrich_with_nextclade( logging.debug("Nextclade results available in %s", result_dir) # Add aligned sequences to aligned_nucleotide_sequences - load_aligned_nuc_sequences(result_dir_seg, segment, aligned_nucleotide_sequences) + # Modifies aligned_nucleotide_sequences in place + aligned_nucleotide_sequences = load_aligned_nuc_sequences( + result_dir_seg, segment, aligned_nucleotide_sequences + ) for gene in config.genes: translation_path = result_dir_seg + f"/nextclade.cds_translation.{gene}.fasta" @@ -273,7 +276,7 @@ def load_aligned_nuc_sequences( aligned_nucleotide_sequences: dict[ AccessionVersion, dict[SegmentName, NucleotideSequence | None] ], -) -> dict[AccessionVersion, NucleotideSequence]: +) -> dict[AccessionVersion, dict[SegmentName, NucleotideSequence | None]]: """ Load the nextclade alignment results into the aligned_nucleotide_sequences dict, mapping each accession to a segmentName: NucleotideSequence dictionary. @@ -284,6 +287,7 @@ def load_aligned_nuc_sequences( sequence_id: str = aligned_sequence.id sequence: NucleotideSequence = str(aligned_sequence.seq) aligned_nucleotide_sequences[sequence_id][segment] = mask_terminal_gaps(sequence) + return aligned_nucleotide_sequences def accession_from_str(id_str: AccessionVersion) -> str: @@ -318,6 +322,8 @@ def get_metadata( nextclade_prefix = "nextclade." if input_path.startswith(nextclade_prefix): # Remove "nextclade." prefix + if spec.args is None: + spec.args = {} segment = spec.args.get("segment", "main") if unprocessed.nextcladeMetadata is None: errors.append( @@ -374,15 +380,11 @@ def process_single( """Process a single sequence per config""" errors: list[ProcessingAnnotation] = [] warnings: list[ProcessingAnnotation] = [] - len_dict: dict[str, str | int] = {} + output_metadata: ProcessedMetadata = {} for segment in config.nucleotideSequences: sequence = unprocessed.unalignedNucleotideSequences[segment] key = "length" if segment == "main" else "length_" + segment - if sequence: - len_dict[key] = len(sequence) - else: - len_dict[key] = 0 - output_metadata: ProcessedMetadata = len_dict + output_metadata[key] = len(sequence) if sequence else 0 for output_field, spec_dict in config.processing_spec.items(): length_fields = [