From 2305fe4de905ac07f2cd69f22cc317e25152e3c0 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 18 Jul 2024 10:33:18 +0200 Subject: [PATCH 1/9] Fix process alignments - do not require nextclade dataset to include clade name. --- ingest/Snakefile | 11 +++-- ingest/scripts/process_alignments.py | 69 ++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 ingest/scripts/process_alignments.py diff --git a/ingest/Snakefile b/ingest/Snakefile index 06de63284..7a070e548 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -150,18 +150,21 @@ rule align: rule process_alignments: input: + script="scripts/process_alignments.py", results=expand( "results/nextclade_{segment}.tsv", segment=config["nucleotide_sequences"], ), output: merged="results/nextclade_merged.tsv", + params: + log_level=LOG_LEVEL, shell: """ - tsv-append --header {input.results} \ - | tsv-select --header --fields seqName,clade \ - | tsv-filter --header --not-empty clade \ - > {output.merged} + python {input.script} \ + --input "{input.results}" \ + --output {output.merged} \ + --log-level {params.log_level} \ """ diff --git a/ingest/scripts/process_alignments.py b/ingest/scripts/process_alignments.py new file mode 100644 index 000000000..613fabef0 --- /dev/null +++ b/ingest/scripts/process_alignments.py @@ -0,0 +1,69 @@ +import csv +import os +import pandas as pd +import logging +import sys + +import click + + +logger = logging.getLogger(__name__) +logging.basicConfig( + encoding="utf-8", + level=logging.DEBUG, + format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ", + datefmt="%H:%M:%S", +) + +# https://stackoverflow.com/questions/15063936 +csv.field_size_limit(sys.maxsize) + + +def validate_paths(ctx, param, value): + """Custom validation function to check if all provided paths exist.""" + paths = value.split(" ") + for path in paths: + if not os.path.exists(path): + msg = f"Path does not exist: {path}" + raise click.BadParameter(msg) + return paths + + +@click.command() +@click.option( + "--input", + required=True, + callback=validate_paths, + help="List of paths to alignment files.", +) +@click.option("--output", required=True, type=click.Path()) +@click.option( + "--log-level", + default="INFO", + type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]), +) +def main( + input: str, + output: str, + log_level: str, +) -> None: + logger.setLevel(log_level) + + appended_df = pd.DataFrame({"seqName": [], "clade": []}) + + for alignment_path in input: + df = pd.read_csv(alignment_path, sep="\t", dtype=str) + seq_clade = df[["seqName", "qc.overallStatus"]] + # drop all rows that do not contain a qc.overallStatus - i.e. did not align to a segment + seq_clade = seq_clade.dropna(subset=["qc.overallStatus"]) + segment_name = (alignment_path.split(".")[-2]).split("_")[-1] + seq_clade_named = seq_clade[["seqName"]] + seq_clade_named["clade"] = segment_name + appended_df = appended_df._append(seq_clade_named, ignore_index=True) + + # saving as tsv file + appended_df.to_csv(output, sep="\t", index=False) + + +if __name__ == "__main__": + main() From bb95e519efe69f55bd7024fb03cd53f4c72a1286 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 18 Jul 2024 12:13:05 +0200 Subject: [PATCH 2/9] Make ncbi_virus_tax_id perSegment to work around prepro issues. --- ingest/config/defaults.yaml | 5 +---- kubernetes/loculus/values.yaml | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index 6c3f0016d..79fde5dba 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -38,7 +38,6 @@ rename: ncbi_sra_accessions: sra_run_accession ncbi_submitter_affiliation: author_affiliations ncbi_submitter_names: authors - # Fields that are shared at sample level # Used to deduplicate samples # Used for segmented viruses only @@ -56,14 +55,12 @@ shared_fields: - authors - ncbi_release_date #TODO (#2171): Allow segments to have different dates - ncbi_update_date - # Fields that that are not shared at sample level # But specific to each segment segment_specific_fields: - - biosample_accession # Should usually be same for each segment - - bioproject_accessions # Should usually be same for each segment? - sra_run_accession # Usually the same for each segment? - ncbi_protein_count + - ncbi_virus_tax_id - insdc_accession_base - insdc_version - insdc_accession_full diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index dbebffb0a..110bb6f64 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -646,6 +646,7 @@ defaultOrganismConfig: &defaultOrganismConfig header: "INSDC" - name: ncbi_virus_tax_id type: int + perSegment: true autocomplete: true customDisplay: type: link From fec0e971de8d4a1436ce1d2216bedeaa6edc7a6d Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 18 Jul 2024 15:18:56 +0200 Subject: [PATCH 3/9] Allow options to group segmented metadata values under the same header. --- kubernetes/loculus/templates/_common-metadata.tpl | 4 ++++ kubernetes/loculus/values.yaml | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/kubernetes/loculus/templates/_common-metadata.tpl b/kubernetes/loculus/templates/_common-metadata.tpl index 8dcd38dbf..0dd690526 100644 --- a/kubernetes/loculus/templates/_common-metadata.tpl +++ b/kubernetes/loculus/templates/_common-metadata.tpl @@ -187,7 +187,11 @@ fields: {{- if .displayName }} displayName: {{ printf "%s %s" .displayName $segment | quote }} {{- end }} + {{- if (default false .oneHeader)}} + header: {{ (default "Other" .header) | quote }} + {{- else }} header: {{ printf "%s %s" (default "Other" .header) $segment | quote }} + {{- end }} {{- end }} {{- end }} {{- else }} diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 110bb6f64..cd0a5655d 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -153,12 +153,14 @@ defaultOrganismConfig: &defaultOrganismConfig hideOnSequenceDetailsPage: true noInput: true perSegment: true + oneHeader: true - name: insdc_version type: int header: "INSDC" hideOnSequenceDetailsPage: true noInput: true perSegment: true + oneHeader: true - name: insdc_accession_full displayName: INSDC accession customDisplay: @@ -168,6 +170,7 @@ defaultOrganismConfig: &defaultOrganismConfig ingest: genbank_accession noInput: true perSegment: true + oneHeader: true - name: bioproject_accessions customDisplay: type: link @@ -183,6 +186,7 @@ defaultOrganismConfig: &defaultOrganismConfig header: "INSDC" noInput: true perSegment: true + oneHeader: true - name: culture_id displayName: Culture ID header: Sample details @@ -647,6 +651,7 @@ defaultOrganismConfig: &defaultOrganismConfig - name: ncbi_virus_tax_id type: int perSegment: true + oneHeader: true autocomplete: true customDisplay: type: link From e3201369d215fae59d76ef25ed1b425cefb71dda Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 18 Jul 2024 15:43:07 +0200 Subject: [PATCH 4/9] Small webpage clean up. --- .../components/SequenceDetailsPage/DataTableEntryValue.tsx | 2 +- .../SequenceDetailsPage/ReferenceSequenceLinkButton.tsx | 2 +- .../src/components/SequenceDetailsPage/getDataTableData.ts | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/website/src/components/SequenceDetailsPage/DataTableEntryValue.tsx b/website/src/components/SequenceDetailsPage/DataTableEntryValue.tsx index b65d1e8b4..d5657f448 100644 --- a/website/src/components/SequenceDetailsPage/DataTableEntryValue.tsx +++ b/website/src/components/SequenceDetailsPage/DataTableEntryValue.tsx @@ -15,7 +15,7 @@ const CustomDisplayComponent: React.FC = ({ data, dataUseTermsHistory }) return (
-
+
{!customDisplay && (value !== '' ? value : None)} {customDisplay?.type === 'badge' && (customDisplay.value === undefined ? ( diff --git a/website/src/components/SequenceDetailsPage/ReferenceSequenceLinkButton.tsx b/website/src/components/SequenceDetailsPage/ReferenceSequenceLinkButton.tsx index d72a04faf..1c1b29607 100644 --- a/website/src/components/SequenceDetailsPage/ReferenceSequenceLinkButton.tsx +++ b/website/src/components/SequenceDetailsPage/ReferenceSequenceLinkButton.tsx @@ -81,7 +81,7 @@ const ReferenceSequenceLinkButton: React.FC = ({ reference }) => { currElement.insdc_accession_full !== undefined && (
{isMultiSegmented && ( -
+
{currElement.name}:
)} diff --git a/website/src/components/SequenceDetailsPage/getDataTableData.ts b/website/src/components/SequenceDetailsPage/getDataTableData.ts index 5f9b7a477..a3e143d1b 100644 --- a/website/src/components/SequenceDetailsPage/getDataTableData.ts +++ b/website/src/components/SequenceDetailsPage/getDataTableData.ts @@ -43,6 +43,11 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa result.topmatter.sequenceDisplayName = entry.value.toString(); continue; } + let regex = new RegExp('^length'); + + if (entry.type.kind === 'metadata' && regex.test(entry.name) && entry.value === 0) { + continue; + } if (!tableHeaderMap.has(entry.header)) { tableHeaderMap.set(entry.header, []); From 9d6fb7c7939e9d4b343e40b808547f321f145b45 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 18 Jul 2024 15:50:42 +0200 Subject: [PATCH 5/9] Formatting fix. --- .../src/components/SequenceDetailsPage/DataTableEntryValue.tsx | 2 +- website/src/components/SequenceDetailsPage/getDataTableData.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/website/src/components/SequenceDetailsPage/DataTableEntryValue.tsx b/website/src/components/SequenceDetailsPage/DataTableEntryValue.tsx index d5657f448..54a72b58b 100644 --- a/website/src/components/SequenceDetailsPage/DataTableEntryValue.tsx +++ b/website/src/components/SequenceDetailsPage/DataTableEntryValue.tsx @@ -15,7 +15,7 @@ const CustomDisplayComponent: React.FC = ({ data, dataUseTermsHistory }) return (
-
+
{!customDisplay && (value !== '' ? value : None)} {customDisplay?.type === 'badge' && (customDisplay.value === undefined ? ( diff --git a/website/src/components/SequenceDetailsPage/getDataTableData.ts b/website/src/components/SequenceDetailsPage/getDataTableData.ts index a3e143d1b..9b361e101 100644 --- a/website/src/components/SequenceDetailsPage/getDataTableData.ts +++ b/website/src/components/SequenceDetailsPage/getDataTableData.ts @@ -43,7 +43,7 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa result.topmatter.sequenceDisplayName = entry.value.toString(); continue; } - let regex = new RegExp('^length'); + const regex = new RegExp('^length'); if (entry.type.kind === 'metadata' && regex.test(entry.name) && entry.value === 0) { continue; From e012c4766fc38f531127178f0ed9302fe085ee9d Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 22 Jul 2024 17:06:01 +0200 Subject: [PATCH 6/9] Add more logging. --- ingest/scripts/group_segments.py | 3 +++ ingest/scripts/process_alignments.py | 1 + 2 files changed, 4 insertions(+) diff --git a/ingest/scripts/group_segments.py b/ingest/scripts/group_segments.py index b8018badc..d87b807d1 100644 --- a/ingest/scripts/group_segments.py +++ b/ingest/scripts/group_segments.py @@ -235,11 +235,13 @@ def main( logging.info(f"Wrote grouped metadata for {len(metadata)} sequences") count = 0 + count_ignored = 0 for record in orjsonl.stream(input_seq): accession = record["id"] raw_sequence = record["sequence"] if accession not in fasta_id_map: logger.warning(f"Accession {accession} not found in input sequence file, skipping") + count_ignored += 1 continue orjsonl.append( output_seq, @@ -250,6 +252,7 @@ def main( ) count += 1 logging.info(f"Wrote {count} sequences") + logging.info(f"Ignored {count_ignored} sequences as not found in {input_seq}") if __name__ == "__main__": diff --git a/ingest/scripts/process_alignments.py b/ingest/scripts/process_alignments.py index 613fabef0..af95b8f80 100644 --- a/ingest/scripts/process_alignments.py +++ b/ingest/scripts/process_alignments.py @@ -63,6 +63,7 @@ def main( # saving as tsv file appended_df.to_csv(output, sep="\t", index=False) + logging.info(f"Kept {len(appended_df.index)} sequences where segment assignment was possible.") if __name__ == "__main__": From 17cb8e67047f9b59bae512ec29ea8c97c1917e52 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 22 Jul 2024 17:08:07 +0200 Subject: [PATCH 7/9] Allow ncbi_update_date to be different per segment. --- ingest/config/defaults.yaml | 1 + kubernetes/loculus/values.yaml | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index 79fde5dba..6cb0204a5 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -65,6 +65,7 @@ segment_specific_fields: - insdc_version - insdc_accession_full - hash + - ncbi_update_date all_fields: - accession - bioprojects diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index cd0a5655d..c226ba0c8 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -92,6 +92,8 @@ defaultOrganismConfig: &defaultOrganismConfig inputs: timestamp: ncbi_update_date noInput: true + perSegment: true + oneHeader: true - name: geo_loc_country displayName: Collection country generateIndex: true @@ -1212,5 +1214,6 @@ runDevelopmentKeycloakDatabase: true runDevelopmentMainDatabase: true enforceHTTPS: true registrationTermsMessage: > - You must agree to the terms of use. -subdomainSeparator: "-" \ No newline at end of file + You must agree to the terms of use. + +subdomainSeparator: "-" From 3b97f90e6b1bc933433155138cc2515a3864c57d Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Tue, 23 Jul 2024 13:53:41 +0200 Subject: [PATCH 8/9] Do not make ncbi_virus_tax_id segment-specific by default. --- ingest/config/defaults.yaml | 1 - kubernetes/loculus/values.yaml | 2 -- 2 files changed, 3 deletions(-) diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index 6cb0204a5..d3a79dab4 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -60,7 +60,6 @@ shared_fields: segment_specific_fields: - sra_run_accession # Usually the same for each segment? - ncbi_protein_count - - ncbi_virus_tax_id - insdc_accession_base - insdc_version - insdc_accession_full diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index c226ba0c8..46718007f 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -652,8 +652,6 @@ defaultOrganismConfig: &defaultOrganismConfig header: "INSDC" - name: ncbi_virus_tax_id type: int - perSegment: true - oneHeader: true autocomplete: true customDisplay: type: link From 2339615e168bc32aab3ecfa5584b0fd06708c3f3 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Tue, 23 Jul 2024 16:33:55 +0200 Subject: [PATCH 9/9] feat(ingest): Use tsv-utils again (4 lines) instead of Python script (70 lines + extra file) (#2329) * This might work already * fix --- ingest/Snakefile | 20 +++++--- ingest/scripts/process_alignments.py | 70 ---------------------------- 2 files changed, 14 insertions(+), 76 deletions(-) delete mode 100644 ingest/scripts/process_alignments.py diff --git a/ingest/Snakefile b/ingest/Snakefile index 7a070e548..15198c7e0 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -150,7 +150,6 @@ rule align: rule process_alignments: input: - script="scripts/process_alignments.py", results=expand( "results/nextclade_{segment}.tsv", segment=config["nucleotide_sequences"], @@ -158,13 +157,22 @@ rule process_alignments: output: merged="results/nextclade_merged.tsv", params: - log_level=LOG_LEVEL, + # -f segment_name1=segment_path1 - segment_name2=segment_path2 + # to do source tracking with tsv-append + # https://github.com/eBay/tsv-utils/blob/master/docs/tool_reference/tsv-append.md + segment_paths=" ".join( + [ + f"-f {segment}=results/nextclade_{segment}.tsv" + for segment in config["nucleotide_sequences"] + ] + ), shell: """ - python {input.script} \ - --input "{input.results}" \ - --output {output.merged} \ - --log-level {params.log_level} \ + tsv-append --header --source-header segment \ + {params.segment_paths} \ + | tsv-filter --header --not-empty alignmentScore \ + | tsv-select --header --fields seqName,segment \ + > {output.merged} """ diff --git a/ingest/scripts/process_alignments.py b/ingest/scripts/process_alignments.py deleted file mode 100644 index af95b8f80..000000000 --- a/ingest/scripts/process_alignments.py +++ /dev/null @@ -1,70 +0,0 @@ -import csv -import os -import pandas as pd -import logging -import sys - -import click - - -logger = logging.getLogger(__name__) -logging.basicConfig( - encoding="utf-8", - level=logging.DEBUG, - format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ", - datefmt="%H:%M:%S", -) - -# https://stackoverflow.com/questions/15063936 -csv.field_size_limit(sys.maxsize) - - -def validate_paths(ctx, param, value): - """Custom validation function to check if all provided paths exist.""" - paths = value.split(" ") - for path in paths: - if not os.path.exists(path): - msg = f"Path does not exist: {path}" - raise click.BadParameter(msg) - return paths - - -@click.command() -@click.option( - "--input", - required=True, - callback=validate_paths, - help="List of paths to alignment files.", -) -@click.option("--output", required=True, type=click.Path()) -@click.option( - "--log-level", - default="INFO", - type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]), -) -def main( - input: str, - output: str, - log_level: str, -) -> None: - logger.setLevel(log_level) - - appended_df = pd.DataFrame({"seqName": [], "clade": []}) - - for alignment_path in input: - df = pd.read_csv(alignment_path, sep="\t", dtype=str) - seq_clade = df[["seqName", "qc.overallStatus"]] - # drop all rows that do not contain a qc.overallStatus - i.e. did not align to a segment - seq_clade = seq_clade.dropna(subset=["qc.overallStatus"]) - segment_name = (alignment_path.split(".")[-2]).split("_")[-1] - seq_clade_named = seq_clade[["seqName"]] - seq_clade_named["clade"] = segment_name - appended_df = appended_df._append(seq_clade_named, ignore_index=True) - - # saving as tsv file - appended_df.to_csv(output, sep="\t", index=False) - logging.info(f"Kept {len(appended_df.index)} sequences where segment assignment was possible.") - - -if __name__ == "__main__": - main()