Skip to content

Commit

Permalink
fix(ingest): Remove tsv-utils so that image can build on main (#2185)
Browse files Browse the repository at this point in the history
* Replace tsv-utils with script until tsv-utils is available for aarch64.
  • Loading branch information
anna-parker authored Jun 26, 2024
1 parent 396ebe7 commit f076cc1
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 5 deletions.
11 changes: 7 additions & 4 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -148,18 +148,21 @@ rule align:

rule process_alignments:
input:
script="scripts/process_alignments.py",
results=expand(
"results/nextclade_{segment}.tsv",
segment=config["nucleotide_sequences"],
),
output:
merged="results/nextclade_merged.tsv",
params:
log_level=LOG_LEVEL,
shell:
"""
tsv-append --header {input.results} \
| tsv-select --header --fields seqName,clade \
| tsv-filter --header --not-empty clade \
> {output.merged}
python {input.script} \
--input "{input.results}" \
--output {output.merged} \
--log-level {params.log_level} \
"""


Expand Down
1 change: 0 additions & 1 deletion ingest/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,4 @@ dependencies:
- requests
- seqkit
- snakemake
- tsv-utils
- unzip
66 changes: 66 additions & 0 deletions ingest/scripts/process_alignments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import csv
import os
import pandas as pd
import logging
import sys

import click


# Log line layout: time, right-aligned level, source file and line number, then the message.
_LOG_FORMAT = "%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s "
_LOG_DATE_FORMAT = "%H:%M:%S"

# Configure root logging once at import time; this module's own logger is created
# right after so it inherits the handler set up here.
logging.basicConfig(
    format=_LOG_FORMAT,
    datefmt=_LOG_DATE_FORMAT,
    level=logging.DEBUG,
    encoding="utf-8",
)
logger = logging.getLogger(__name__)

# Raise the csv module's per-field size limit to the largest value the platform
# allows, so very long fields are not rejected.
# See https://stackoverflow.com/questions/15063936
csv.field_size_limit(sys.maxsize)


def validate_paths(ctx, param, value):
    """Click callback: split a whitespace-separated path list and check each path exists.

    Args:
        ctx: Click context (unused; required by the callback signature).
        param: Click parameter being validated (unused; required by the signature).
        value: Raw option value containing one or more paths separated by whitespace.

    Returns:
        The individual path strings, in the order they were given.

    Raises:
        click.BadParameter: If no path was supplied or any path does not exist.
    """
    # split() with no separator collapses runs of whitespace and ignores leading or
    # trailing blanks, so "a  b " yields ["a", "b"] instead of spurious "" entries
    # that would be reported as non-existent paths.
    paths = value.split()
    if not paths:
        msg = "No input paths provided"
        raise click.BadParameter(msg)
    for path in paths:
        if not os.path.exists(path):
            msg = f"Path does not exist: {path}"
            raise click.BadParameter(msg)
    return paths


@click.command()
@click.option(
    "--input",
    required=True,
    callback=validate_paths,
    help="List of paths to alignment files.",
)
@click.option("--output", required=True, type=click.Path())
@click.option(
    "--log-level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
)
def main(
    input: list[str],
    output: str,
    log_level: str,
) -> None:
    """Merge the seqName and clade columns of several alignment TSV files into one TSV.

    Rows without a clade value are dropped before merging.
    """
    # NOTE: `input` shadows the builtin, but the name is dictated by the --input
    # option; by the time it arrives here the callback has turned it into a list of paths.
    logger.setLevel(log_level)

    columns = ["seqName", "clade"]

    # Gather one filtered frame per file and concatenate once at the end; this uses the
    # public pandas API instead of the private DataFrame._append and avoids re-copying
    # the accumulated rows on every iteration.
    frames = []
    for alignment_path in input:
        df = pd.read_csv(alignment_path, sep="\t", dtype=str)
        # drop all rows that do not contain a clade - i.e. did not align to a segment
        frames.append(df[columns].dropna(subset=["clade"]))

    if frames:
        merged = pd.concat(frames, ignore_index=True)
    else:
        # Nothing to merge: still emit a header-only file with the expected columns.
        merged = pd.DataFrame(columns=columns)

    # saving as tsv file
    merged.to_csv(output, sep="\t", index=False)


if __name__ == "__main__":
    main()

0 comments on commit f076cc1

Please sign in to comment.