Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(ingest): Small ingest fixes for multi-segmented viruses #2316

Merged
merged 10 commits into from
Jul 23, 2024
11 changes: 7 additions & 4 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -150,18 +150,21 @@ rule align:

rule process_alignments:
input:
script="scripts/process_alignments.py",
results=expand(
"results/nextclade_{segment}.tsv",
segment=config["nucleotide_sequences"],
),
output:
merged="results/nextclade_merged.tsv",
params:
log_level=LOG_LEVEL,
shell:
"""
tsv-append --header {input.results} \
| tsv-select --header --fields seqName,clade \
| tsv-filter --header --not-empty clade \
> {output.merged}
python {input.script} \
anna-parker marked this conversation as resolved.
Show resolved Hide resolved
--input "{input.results}" \
--output {output.merged} \
--log-level {params.log_level} \
"""


Expand Down
5 changes: 1 addition & 4 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ rename:
ncbi_sra_accessions: sra_run_accession
ncbi_submitter_affiliation: author_affiliations
ncbi_submitter_names: authors

# Fields that are shared at sample level
# Used to deduplicate samples
# Used for segmented viruses only
Expand All @@ -56,18 +55,16 @@ shared_fields:
- authors
- ncbi_release_date #TODO (#2171): Allow segments to have different dates
- ncbi_update_date

# Fields that are not shared at sample level
# But specific to each segment
segment_specific_fields:
- biosample_accession # Should usually be same for each segment
- bioproject_accessions # Should usually be same for each segment?
- sra_run_accession # Usually the same for each segment?
- ncbi_protein_count
- insdc_accession_base
- insdc_version
- insdc_accession_full
- hash
- ncbi_update_date
all_fields:
- accession
- bioprojects
Expand Down
3 changes: 3 additions & 0 deletions ingest/scripts/group_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,11 +235,13 @@ def main(
logging.info(f"Wrote grouped metadata for {len(metadata)} sequences")

count = 0
count_ignored = 0
for record in orjsonl.stream(input_seq):
accession = record["id"]
raw_sequence = record["sequence"]
if accession not in fasta_id_map:
logger.warning(f"Accession {accession} not found in input sequence file, skipping")
count_ignored += 1
continue
orjsonl.append(
output_seq,
Expand All @@ -250,6 +252,7 @@ def main(
)
count += 1
logging.info(f"Wrote {count} sequences")
logging.info(f"Ignored {count_ignored} sequences as not found in {input_seq}")
anna-parker marked this conversation as resolved.
Show resolved Hide resolved


if __name__ == "__main__":
Expand Down
70 changes: 70 additions & 0 deletions ingest/scripts/process_alignments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import csv
import os
import pandas as pd
import logging
import sys

import click


# Module-level logger for this script; main() sets its level from --log-level.
logger = logging.getLogger(__name__)
# Configure the root logger: DEBUG threshold, UTF-8 encoding, and a
# "time level (file:line) - message" record layout.
logging.basicConfig(
encoding="utf-8",
level=logging.DEBUG,
format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ",
datefmt="%H:%M:%S",
)

# Raise the csv module's per-field size limit to the largest supported value.
# NOTE(review): the visible code reads its inputs with pandas, not csv -
# confirm whether this limit is still required.
# https://stackoverflow.com/questions/15063936
csv.field_size_limit(sys.maxsize)


def validate_paths(ctx, param, value):
    """Click callback: split a whitespace-separated path list and validate it.

    Args:
        ctx: Click context (unused; required by the callback signature).
        param: Click parameter being validated (unused).
        value: Single string holding one or more paths separated by
            whitespace, e.g. the space-joined ``{input.results}`` that
            Snakemake passes on the command line.

    Returns:
        The list of individual paths, in the order given.

    Raises:
        click.BadParameter: If no path was given or any path does not exist.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so "a  b " does not produce empty "paths"
    # that would then fail the existence check.
    paths = value.split()
    if not paths:
        msg = "No input paths provided"
        raise click.BadParameter(msg)
    for path in paths:
        if not os.path.exists(path):
            msg = f"Path does not exist: {path}"
            raise click.BadParameter(msg)
    return paths


@click.command()
@click.option(
    "--input",
    required=True,
    callback=validate_paths,
    help="List of paths to alignment files.",
)
@click.option("--output", required=True, type=click.Path())
@click.option(
    "--log-level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
)
def main(
    input: list[str],
    output: str,
    log_level: str,
) -> None:
    """Merge per-segment alignment TSVs into one seqName/clade table.

    Each input file is read as a TSV. Rows without a qc.overallStatus value
    are dropped, and every remaining seqName is labelled with the segment
    name taken from the file name (the part after the last underscore,
    before the extension). The combined table is written to the output path
    as a TSV with the columns seqName and clade.
    """
    # `input` arrives as a list of paths: the validate_paths callback has
    # already split and checked the raw option string.
    logger.setLevel(log_level)

    per_segment_frames = []
    for alignment_path in input:
        df = pd.read_csv(alignment_path, sep="\t", dtype=str)
        # drop all rows that do not contain a qc.overallStatus - i.e. did not align to a segment
        aligned = df.dropna(subset=["qc.overallStatus"])
        # e.g. "results/nextclade_L.tsv" -> "L"
        segment_name = (alignment_path.split(".")[-2]).split("_")[-1]
        # assign() returns a new frame, so no column is set on a slice of df.
        per_segment_frames.append(aligned[["seqName"]].assign(clade=segment_name))

    if per_segment_frames:
        # One concat at the end instead of growing a frame inside the loop.
        appended_df = pd.concat(per_segment_frames, ignore_index=True)
    else:
        appended_df = pd.DataFrame({"seqName": [], "clade": []})

    # saving as tsv file
    appended_df.to_csv(output, sep="\t", index=False)
    # Use the module logger so the message honours --log-level.
    logger.info(f"Kept {len(appended_df.index)} sequences where segment assignment was possible.")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions kubernetes/loculus/templates/_common-metadata.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,11 @@ fields:
{{- if .displayName }}
displayName: {{ printf "%s %s" .displayName $segment | quote }}
{{- end }}
{{- if (default false .oneHeader)}}
header: {{ (default "Other" .header) | quote }}
{{- else }}
header: {{ printf "%s %s" (default "Other" .header) $segment | quote }}
{{- end }}
{{- end }}
{{- end }}
{{- else }}
Expand Down
11 changes: 9 additions & 2 deletions kubernetes/loculus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ defaultOrganismConfig: &defaultOrganismConfig
inputs:
timestamp: ncbi_update_date
noInput: true
perSegment: true
oneHeader: true
- name: geo_loc_country
displayName: Collection country
generateIndex: true
Expand Down Expand Up @@ -153,12 +155,14 @@ defaultOrganismConfig: &defaultOrganismConfig
hideOnSequenceDetailsPage: true
noInput: true
perSegment: true
oneHeader: true
- name: insdc_version
type: int
header: "INSDC"
hideOnSequenceDetailsPage: true
noInput: true
perSegment: true
oneHeader: true
- name: insdc_accession_full
displayName: INSDC accession
customDisplay:
Expand All @@ -168,6 +172,7 @@ defaultOrganismConfig: &defaultOrganismConfig
ingest: genbank_accession
noInput: true
perSegment: true
oneHeader: true
- name: bioproject_accessions
customDisplay:
type: link
Expand All @@ -183,6 +188,7 @@ defaultOrganismConfig: &defaultOrganismConfig
header: "INSDC"
noInput: true
perSegment: true
oneHeader: true
- name: culture_id
displayName: Culture ID
header: Sample details
Expand Down Expand Up @@ -1206,5 +1212,6 @@ runDevelopmentKeycloakDatabase: true
runDevelopmentMainDatabase: true
enforceHTTPS: true
registrationTermsMessage: >
You must agree to the <a href="http://main.loculus.org/terms">terms of use</a>.
subdomainSeparator: "-"
You must agree to the <a href="http://main.loculus.org/terms">terms of use</a>.

subdomainSeparator: "-"
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ const CustomDisplayComponent: React.FC<Props> = ({ data, dataUseTermsHistory })

return (
<div className='whitespace-normal text-gray-600 break-inside-avoid'>
<div className='whitespace-wrap'>
<div className='break-all whitespace-wrap'>
{!customDisplay && (value !== '' ? value : <span className='italic'>None</span>)}
{customDisplay?.type === 'badge' &&
(customDisplay.value === undefined ? (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ const ReferenceSequenceLinkButton: React.FC<Props> = ({ reference }) => {
currElement.insdc_accession_full !== undefined && (
<div className='text-primary-700 ml-5 flex'>
{isMultiSegmented && (
<div className='w-6 text-left mr-2'>
<div className='w-10 text-left mr-2'>
{currElement.name}:
</div>
)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa
result.topmatter.sequenceDisplayName = entry.value.toString();
continue;
}
const regex = new RegExp('^length');

if (entry.type.kind === 'metadata' && regex.test(entry.name) && entry.value === 0) {
continue;
}

if (!tableHeaderMap.has(entry.header)) {
tableHeaderMap.set(entry.header, []);
Expand Down
Loading