diff --git a/ena-submission/config/defaults.yaml b/ena-submission/config/defaults.yaml
index 16d959bfe..b1acab7ae 100644
--- a/ena-submission/config/defaults.yaml
+++ b/ena-submission/config/defaults.yaml
@@ -30,7 +30,7 @@ metadata_mapping:
'geographic location (country and/or sea)':
loculus_fields: [geoLocCountry]
'geographic location (region and locality)':
- loculus_fields: [geoLocAdmin1]
+ loculus_fields: [geoLocAdmin1, geoLocAdmin2, geoLocCity]
'sample capture status':
loculus_fields: [purposeOfSampling]
'host disease outcome':
diff --git a/ena-submission/src/ena_deposition/create_assembly.py b/ena-submission/src/ena_deposition/create_assembly.py
index c12a3aa57..ccf61e67f 100644
--- a/ena-submission/src/ena_deposition/create_assembly.py
+++ b/ena-submission/src/ena_deposition/create_assembly.py
@@ -119,6 +119,7 @@ def create_manifest_object(
address.get("country"),
]
address_string = ", ".join([x for x in address_list if x is not None])
+ logging.debug("Created address from group_info")
except Exception as e:
logging.error(f"Was unable to create address, setting address to center_name due to {e}")
@@ -126,15 +127,18 @@ def create_manifest_object(
unaligned_nucleotide_sequences = submission_table_entry["unaligned_nucleotide_sequences"]
organism_metadata = config.organisms[group_key["organism"]]["enaDeposition"]
chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key)
+ logging.debug("Created chromosome list object")
chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object, dir=dir)
+ logging.debug("Created chromosome list file")
authors = (
metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
)
try:
authors = reformat_authors_from_loculus_to_embl_style(authors)
- except ValueError as err:
+ logging.debug("Reformatted authors")
+ except Exception as err:
msg = f"Was unable to format authors: {authors} as ENA expects"
- logger.error(msg)
+ logging.error(msg)
raise ValueError(msg) from err
collection_date = metadata.get("sampleCollectionDate", "Unknown")
country = metadata.get("geoLocCountry", "Unknown")
@@ -163,6 +167,7 @@ def create_manifest_object(
organism=organism,
dir=dir,
)
+ logging.debug("Created flatfile")
program = (
metadata["sequencingInstrument"] if metadata.get("sequencingInstrument") else "Unknown"
)
diff --git a/ena-submission/src/ena_deposition/create_sample.py b/ena-submission/src/ena_deposition/create_sample.py
index f685ea472..9b3e1f12f 100644
--- a/ena-submission/src/ena_deposition/create_sample.py
+++ b/ena-submission/src/ena_deposition/create_sample.py
@@ -67,7 +67,7 @@ def get_sample_attributes(config: Config, sample_metadata: dict[str, str], row:
else:
continue
else:
- value = ";".join(
+ value = "; ".join(
[str(metadata) for metadata in loculus_metadata_field_values if metadata]
)
if value:
diff --git a/ena-submission/test/approved_ena_submission_list_test.json b/ena-submission/test/approved_ena_submission_list_test.json
index 4139efce8..17da088e3 100644
--- a/ena-submission/test/approved_ena_submission_list_test.json
+++ b/ena-submission/test/approved_ena_submission_list_test.json
@@ -15,7 +15,7 @@
"bodyProduct": null,
"displayName": "Pakistan/LOC_0001TLY.1/2023-08-26",
"foodProduct": null,
- "geoLocCity": null,
+ "geoLocCity": "Rawalpindi",
"geoLocSite": null,
"hostAgeBin": null,
"hostDisease": null,
@@ -36,8 +36,8 @@
"passageNumber": null,
"travelHistory": null,
"anatomicalPart": null,
- "geoLocAdmin1": null,
- "geoLocAdmin2": null,
+ "geoLocAdmin1": "Punjab",
+ "geoLocAdmin2": "Rawalpindi",
"geoLocLatitude": null,
"geoLocLongitude": null,
"geoLocCountry": "Pakistan",
diff --git a/ena-submission/test/test_sample_request.xml b/ena-submission/test/test_sample_request.xml
index 7c1c63456..4911524a4 100644
--- a/ena-submission/test/test_sample_request.xml
+++ b/ena-submission/test/test_sample_request.xml
@@ -27,6 +27,10 @@
geographic location (country and/or sea)
Pakistan
+
+ geographic location (region and locality)
+ Punjab; Rawalpindi; Rawalpindi
+
host health state
Hospital care required
diff --git a/ingest/scripts/filter_out_depositions.py b/ingest/scripts/filter_out_depositions.py
index 04e288152..b8c49f643 100644
--- a/ingest/scripts/filter_out_depositions.py
+++ b/ingest/scripts/filter_out_depositions.py
@@ -72,12 +72,14 @@ def filter_out_depositions(
df = pd.read_csv(input_metadata_tsv, sep="\t", dtype=str, keep_default_na=False)
original_count = len(df)
with open(exclude_insdc_accessions, encoding="utf-8") as f:
- loculus_insdc_accessions = [line.strip() for line in f]
+ loculus_insdc_accessions: set = {line.strip().split(".")[0] for line in f} # Remove version
with open(exclude_biosample_accessions, encoding="utf-8") as f:
loculus_biosample_accessions = [line.strip() for line in f]
- filtered_df = df[~df["genbankAccession"].isin(loculus_insdc_accessions)]
+ filtered_df = df[
+ ~df["genbankAccession"].str.split(".").str[0].isin(loculus_insdc_accessions)
+ ] # Filter out all versions of an accession
filtered_df = filtered_df[~filtered_df["biosampleAccession"].isin(loculus_biosample_accessions)]
logger.info(f"Filtered out {(original_count - len(filtered_df))} sequences.")
filtered_df.to_csv(output_metadata_tsv, sep="\t", index=False)
diff --git a/ingest/tests/test_data_cchf/ncbi_dataset.zip b/ingest/tests/test_data_cchf/ncbi_dataset.zip
index a94f72b44..a1e627611 100644
Binary files a/ingest/tests/test_data_cchf/ncbi_dataset.zip and b/ingest/tests/test_data_cchf/ncbi_dataset.zip differ