diff --git a/ena-submission/config/defaults.yaml b/ena-submission/config/defaults.yaml index 16d959bfe..b1acab7ae 100644 --- a/ena-submission/config/defaults.yaml +++ b/ena-submission/config/defaults.yaml @@ -30,7 +30,7 @@ metadata_mapping: 'geographic location (country and/or sea)': loculus_fields: [geoLocCountry] 'geographic location (region and locality)': - loculus_fields: [geoLocAdmin1] + loculus_fields: [geoLocAdmin1, geoLocAdmin2, geoLocCity] 'sample capture status': loculus_fields: [purposeOfSampling] 'host disease outcome': diff --git a/ena-submission/src/ena_deposition/create_assembly.py b/ena-submission/src/ena_deposition/create_assembly.py index c12a3aa57..ccf61e67f 100644 --- a/ena-submission/src/ena_deposition/create_assembly.py +++ b/ena-submission/src/ena_deposition/create_assembly.py @@ -119,6 +119,7 @@ def create_manifest_object( address.get("country"), ] address_string = ", ".join([x for x in address_list if x is not None]) + logging.debug("Created address from group_info") except Exception as e: logging.error(f"Was unable to create address, setting address to center_name due to {e}") @@ -126,15 +127,18 @@ def create_manifest_object( unaligned_nucleotide_sequences = submission_table_entry["unaligned_nucleotide_sequences"] organism_metadata = config.organisms[group_key["organism"]]["enaDeposition"] chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key) + logging.debug("Created chromosome list object") chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir) + logging.debug("Created chromosome list file") authors = ( metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown") ) try: authors = reformat_authors_from_loculus_to_embl_style(authors) - except ValueError as err: + logging.debug("Reformatted authors") + except Exception as err: msg = f"Was unable to format authors: {authors} as ENA expects" - logger.error(msg) + logging.error(msg) raise ValueError(msg) from err collection_date = metadata.get("sampleCollectionDate", "Unknown") country = metadata.get("geoLocCountry", "Unknown") @@ -163,6 +167,7 @@ def create_manifest_object( organism=organism, dir=dir, ) + logging.debug("Created flatfile") program = ( metadata["sequencingInstrument"] if metadata.get("sequencingInstrument") else "Unknown" ) diff --git a/ena-submission/src/ena_deposition/create_sample.py b/ena-submission/src/ena_deposition/create_sample.py index f685ea472..9b3e1f12f 100644 --- a/ena-submission/src/ena_deposition/create_sample.py +++ b/ena-submission/src/ena_deposition/create_sample.py @@ -67,7 +67,7 @@ def get_sample_attributes(config: Config, sample_metadata: dict[str, str], row: else: continue else: - value = ";".join( + value = "; ".join( [str(metadata) for metadata in loculus_metadata_field_values if metadata] ) if value: diff --git a/ena-submission/test/approved_ena_submission_list_test.json b/ena-submission/test/approved_ena_submission_list_test.json index 4139efce8..17da088e3 100644 --- a/ena-submission/test/approved_ena_submission_list_test.json +++ b/ena-submission/test/approved_ena_submission_list_test.json @@ -15,7 +15,7 @@ "bodyProduct": null, "displayName": "Pakistan/LOC_0001TLY.1/2023-08-26", "foodProduct": null, - "geoLocCity": null, + "geoLocCity": "Rawalpindi", "geoLocSite": null, "hostAgeBin": null, "hostDisease": null, @@ -36,8 +36,8 @@ "passageNumber": null, "travelHistory": null, "anatomicalPart": null, - "geoLocAdmin1": null, - "geoLocAdmin2": null, + "geoLocAdmin1": "Punjab", + "geoLocAdmin2": "Rawalpindi", "geoLocLatitude": null, "geoLocLongitude": null, "geoLocCountry": "Pakistan", diff --git a/ena-submission/test/test_sample_request.xml b/ena-submission/test/test_sample_request.xml index 7c1c63456..4911524a4 100644 --- a/ena-submission/test/test_sample_request.xml +++ b/ena-submission/test/test_sample_request.xml @@ -27,6 +27,10 @@ geographic location (country and/or sea) Pakistan + + geographic location (region and locality) + Punjab; Rawalpindi; Rawalpindi + host health state Hospital care required diff --git a/ingest/scripts/filter_out_depositions.py b/ingest/scripts/filter_out_depositions.py index 04e288152..b8c49f643 100644 --- a/ingest/scripts/filter_out_depositions.py +++ b/ingest/scripts/filter_out_depositions.py @@ -72,12 +72,14 @@ def filter_out_depositions( df = pd.read_csv(input_metadata_tsv, sep="\t", dtype=str, keep_default_na=False) original_count = len(df) with open(exclude_insdc_accessions, encoding="utf-8") as f: - loculus_insdc_accessions = [line.strip() for line in f] + loculus_insdc_accessions: set = {line.strip().split(".")[0] for line in f} # Remove version with open(exclude_biosample_accessions, encoding="utf-8") as f: loculus_biosample_accessions = [line.strip() for line in f] - filtered_df = df[~df["genbankAccession"].isin(loculus_insdc_accessions)] + filtered_df = df[ + ~df["genbankAccession"].str.split(".").str[0].isin(loculus_insdc_accessions) + ] # Filter out all versions of an accession filtered_df = filtered_df[~filtered_df["biosampleAccession"].isin(loculus_biosample_accessions)] logger.info(f"Filtered out {(original_count - len(filtered_df))} sequences.") filtered_df.to_csv(output_metadata_tsv, sep="\t", index=False) diff --git a/ingest/tests/test_data_cchf/ncbi_dataset.zip b/ingest/tests/test_data_cchf/ncbi_dataset.zip index a94f72b44..a1e627611 100644 Binary files a/ingest/tests/test_data_cchf/ncbi_dataset.zip and b/ingest/tests/test_data_cchf/ncbi_dataset.zip differ