loculus-project · anna-parker · Jul 23, 2024 · Jul 18, 2024 · Jul 18, 2024 · Jul 18, 2024
diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -150,18 +150,21 @@ rule align:
 
 rule process_alignments:
     input:
+        script="scripts/process_alignments.py",
         results=expand(
             "results/nextclade_{segment}.tsv",
             segment=config["nucleotide_sequences"],
         ),
     output:
         merged="results/nextclade_merged.tsv",
+    params:
+        log_level=LOG_LEVEL,
+    # All per-segment result files are handed to the script as ONE quoted,
+    # space-separated --input argument; the script splits the value again.
     shell:
         """
-        tsv-append --header {input.results} \
-        | tsv-select --header --fields seqName,clade \
-        | tsv-filter --header --not-empty clade \
-        > {output.merged}
+        python {input.script} \
+            --input "{input.results}" \
+            --output {output.merged} \
+            --log-level {params.log_level}
         """
 
 

diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml
@@ -38,7 +38,6 @@ rename:
   ncbi_sra_accessions: sra_run_accession
   ncbi_submitter_affiliation: author_affiliations
   ncbi_submitter_names: authors
-
 # Fields that are shared at sample level
 # Used to deduplicate samples
 # Used for segmented viruses only
@@ -56,18 +55,17 @@ shared_fields:
   - authors
   - ncbi_release_date #TODO (#2171): Allow segments to have different dates
   - ncbi_update_date
-
 # Fields that are not shared at sample level
 # But specific to each segment
 segment_specific_fields:
-  - biosample_accession # Should usually be same for each segment
-  - bioproject_accessions # Should usually be same for each segment?
   - sra_run_accession # Usually the same for each segment?
   - ncbi_protein_count
+  - ncbi_virus_tax_id
   - insdc_accession_base
   - insdc_version
   - insdc_accession_full
   - hash
+  - ncbi_update_date
 all_fields:
   - accession
   - bioprojects

diff --git a/ingest/scripts/group_segments.py b/ingest/scripts/group_segments.py
@@ -235,11 +235,13 @@ def main(
     logging.info(f"Wrote grouped metadata for {len(metadata)} sequences")
 
     count = 0
+    count_ignored = 0
     for record in orjsonl.stream(input_seq):
         accession = record["id"]
         raw_sequence = record["sequence"]
         if accession not in fasta_id_map:
             logger.warning(f"Accession {accession} not found in input sequence file, skipping")
+            count_ignored += 1
             continue
         orjsonl.append(
             output_seq,
@@ -250,6 +252,7 @@ def main(
         )
         count += 1
     logging.info(f"Wrote {count} sequences")
+    logging.info(f"Ignored {count_ignored} sequences as not found in {input_seq}")
 
 
 if __name__ == "__main__":

diff --git a/ingest/scripts/process_alignments.py b/ingest/scripts/process_alignments.py
@@ -0,0 +1,70 @@
+import csv
+import os
+import pandas as pd
+import logging
+import sys
+
+import click
+
+
+# Lift the csv module's per-field size limit to sys.maxsize.
+# See https://stackoverflow.com/questions/15063936
+csv.field_size_limit(sys.maxsize)
+
+# Root logging configuration: DEBUG level, timestamped records that include
+# the emitting file name and line number.
+logging.basicConfig(
+    datefmt="%H:%M:%S",
+    format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ",
+    level=logging.DEBUG,
+    encoding="utf-8",
+)
+
+# Module-level logger used by this script.
+logger = logging.getLogger(__name__)
+
+
+def validate_paths(ctx, param, value):
+    """Click callback: split a whitespace-separated list of paths and check each exists.
+
+    Args:
+        ctx: Click context (unused; required by the callback signature).
+        param: Click parameter being validated (unused).
+        value: Raw option value, e.g. ``"results/a.tsv results/b.tsv"``.
+
+    Returns:
+        The individual paths as a list of strings.
+
+    Raises:
+        click.BadParameter: If no path is given or any of the paths does not exist.
+    """
+    # split() without an argument collapses repeated, leading and trailing
+    # whitespace; split(" ") would yield empty strings that then fail the
+    # existence check with a confusing empty path in the message.
+    paths = value.split()
+    if not paths:
+        msg = "No input paths provided"
+        raise click.BadParameter(msg)
+    for path in paths:
+        if not os.path.exists(path):
+            msg = f"Path does not exist: {path}"
+            raise click.BadParameter(msg)
+    return paths
+
+
+@click.command()
+@click.option(
+    "--input",
+    required=True,
+    callback=validate_paths,
+    help="List of paths to alignment files.",
+)
+@click.option("--output", required=True, type=click.Path())
+@click.option(
+    "--log-level",
+    default="INFO",
+    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
+)
+def main(
+    input: list[str],
+    output: str,
+    log_level: str,
+) -> None:
+    """Merge per-segment alignment TSVs into one table of seqName and segment.
+
+    For every input file, rows with a non-empty ``qc.overallStatus`` are kept
+    and labelled with the segment name taken from the file name (the part of
+    the file stem after the last underscore). The result is written to
+    ``output`` as a TSV with the columns ``seqName`` and ``clade``.
+
+    Args:
+        input: Paths of the alignment TSV files (already split and checked by
+            ``validate_paths``).
+        output: Path of the merged TSV file to write.
+        log_level: Level applied to this module's logger.
+    """
+    logger.setLevel(log_level)
+
+    per_segment = []
+    for alignment_path in input:
+        # Only these two columns are used, so the others are not loaded.
+        df = pd.read_csv(
+            alignment_path,
+            sep="\t",
+            dtype=str,
+            usecols=["seqName", "qc.overallStatus"],
+        )
+        # drop all rows that do not contain a qc.overallStatus - i.e. did not align to a segment
+        aligned = df.dropna(subset=["qc.overallStatus"])
+        # e.g. "results/nextclade_L.tsv" -> stem "nextclade_L" -> segment "L"
+        stem = os.path.splitext(os.path.basename(alignment_path))[0]
+        segment_name = stem.rsplit("_", 1)[-1]
+        # assign() returns a new frame, so no column is set on a slice of `aligned`.
+        per_segment.append(aligned[["seqName"]].assign(clade=segment_name))
+
+    # Concatenate once at the end instead of growing a frame inside the loop.
+    if per_segment:
+        merged = pd.concat(per_segment, ignore_index=True)
+    else:
+        merged = pd.DataFrame(columns=["seqName", "clade"])
+
+    # saving as tsv file
+    merged.to_csv(output, sep="\t", index=False)
+    # Use the module logger so the message honours --log-level.
+    logger.info(f"Kept {len(merged.index)} sequences where segment assignment was possible.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/kubernetes/loculus/templates/_common-metadata.tpl b/kubernetes/loculus/templates/_common-metadata.tpl
@@ -187,7 +187,11 @@ fields:
   {{- if .displayName }}
   displayName: {{ printf "%s %s" .displayName $segment | quote }}
   {{- end }}
+  {{- if (default false .oneHeader)}}
+  header: {{ (default "Other" .header) | quote }}
+  {{- else }}
   header: {{ printf "%s %s" (default "Other" .header) $segment | quote }}
+  {{- end }}
 {{- end }}
 {{- end }}
 {{- else }}

diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
@@ -92,6 +92,8 @@ defaultOrganismConfig: &defaultOrganismConfig
           inputs:
             timestamp: ncbi_update_date
         noInput: true
+        perSegment: true
+        oneHeader: true
       - name: geo_loc_country
         displayName: Collection country
         generateIndex: true
@@ -153,12 +155,14 @@ defaultOrganismConfig: &defaultOrganismConfig
         hideOnSequenceDetailsPage: true
         noInput: true
         perSegment: true
+        oneHeader: true
       - name: insdc_version
         type: int
         header: "INSDC"
         hideOnSequenceDetailsPage: true
         noInput: true
         perSegment: true
+        oneHeader: true
       - name: insdc_accession_full
         displayName: INSDC accession
         customDisplay:
@@ -168,6 +172,7 @@ defaultOrganismConfig: &defaultOrganismConfig
         ingest: genbank_accession
         noInput: true
         perSegment: true
+        oneHeader: true
       - name: bioproject_accessions
         customDisplay:
           type: link
@@ -183,6 +188,7 @@ defaultOrganismConfig: &defaultOrganismConfig
         header: "INSDC"
         noInput: true
         perSegment: true
+        oneHeader: true
       - name: culture_id
         displayName: Culture ID
         header: Sample details
@@ -646,6 +652,8 @@ defaultOrganismConfig: &defaultOrganismConfig
         header: "INSDC"
       - name: ncbi_virus_tax_id
         type: int
+        perSegment: true
+        oneHeader: true
         autocomplete: true
         customDisplay:
           type: link
@@ -1206,5 +1214,6 @@ runDevelopmentKeycloakDatabase: true
 runDevelopmentMainDatabase: true
 enforceHTTPS: true
 registrationTermsMessage: >
-   You must agree to the <a href="http://main.loculus.org/terms">terms of use</a>.
-subdomainSeparator: "-"
+  You must agree to the <a href="http://main.loculus.org/terms">terms of use</a>.
+
+subdomainSeparator: "-"
diff --git a/website/src/components/SequenceDetailsPage/DataTableEntryValue.tsx b/website/src/components/SequenceDetailsPage/DataTableEntryValue.tsx
@@ -15,7 +15,7 @@ const CustomDisplayComponent: React.FC<Props> = ({ data, dataUseTermsHistory })
 
     return (
         <div className='whitespace-normal text-gray-600 break-inside-avoid'>
-            <div className='whitespace-wrap'>
+            <div className='break-all whitespace-wrap'>
                 {!customDisplay && (value !== '' ? value : <span className='italic'>None</span>)}
                 {customDisplay?.type === 'badge' &&
                     (customDisplay.value === undefined ? (

diff --git a/website/src/components/SequenceDetailsPage/ReferenceSequenceLinkButton.tsx b/website/src/components/SequenceDetailsPage/ReferenceSequenceLinkButton.tsx
@@ -81,7 +81,7 @@ const ReferenceSequenceLinkButton: React.FC<Props> = ({ reference }) => {
                                                             currElement.insdc_accession_full !== undefined && (
                                                                 <div className='text-primary-700 ml-5 flex'>
                                                                     {isMultiSegmented && (
-                                                                        <div className='w-6 text-left mr-2'>
+                                                                        <div className='w-10 text-left mr-2'>
                                                                             {currElement.name}:
                                                                         </div>
                                                                     )}

diff --git a/website/src/components/SequenceDetailsPage/getDataTableData.ts b/website/src/components/SequenceDetailsPage/getDataTableData.ts
@@ -43,6 +43,11 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa
             result.topmatter.sequenceDisplayName = entry.value.toString();
             continue;
         }
+        const regex = new RegExp('^length');
+
+        if (entry.type.kind === 'metadata' && regex.test(entry.name) && entry.value === 0) {
+            continue;
+        }
 
         if (!tableHeaderMap.has(entry.header)) {
             tableHeaderMap.set(entry.header, []);