From 2bb74bfc26dee80af7d04089aafe636ea7c6d76a Mon Sep 17 00:00:00 2001 From: Alexander Thomas <77535027+alethomas@users.noreply.github.com> Date: Wed, 15 Nov 2023 14:43:15 +0100 Subject: [PATCH] fix: QA genome generation (#613) * fix deprecated squeeze * fix wrong seq name * fmt * remove not needed file handling --- workflow/scripts/masking.py | 8 ++------ workflow/scripts/quality-filter.py | 8 +++----- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/workflow/scripts/masking.py b/workflow/scripts/masking.py index 95fdb9d75..e02d46009 100644 --- a/workflow/scripts/masking.py +++ b/workflow/scripts/masking.py @@ -48,7 +48,6 @@ def get_base_count(pileupcolumn): for pileupread in pileupcolumn.pileups: # TODO Check pileupread for missing bases if not pileupread.is_del and not pileupread.is_refskip: - read_base = pileupread.alignment.query_sequence[pileupread.query_position] bases.append(read_base) @@ -102,7 +101,6 @@ def mask_sequence(sequence, coverages, base_counts): covered_postions = coverages.keys() for position, base in enumerate(sequence): - if position not in covered_postions: # TODO Check why there are postions that are not covered by any reads and are not Ns # sequence[position] = "N" @@ -160,10 +158,8 @@ def mask_sequence(sequence, coverages, base_counts): def write_sequence(sequence): - with pysam.FastxFile(snakemake.input.sequence) as infile, open( - snakemake.output.masked_sequence, mode="w" - ) as outfile: - print(">%s" % next(infile).name.split(".")[0], file=outfile) + with open(snakemake.output.masked_sequence, mode="w") as outfile: + print(">%s" % snakemake.wildcards.sample, file=outfile) print(sequence, file=outfile) diff --git a/workflow/scripts/quality-filter.py b/workflow/scripts/quality-filter.py index 7cc41b50c..68977de25 100644 --- a/workflow/scripts/quality-filter.py +++ b/workflow/scripts/quality-filter.py @@ -31,13 +31,11 @@ def get_identity(quast_report_paths: List[str]) -> dict: sample = path.dirname(report_path).split("/")[-1] # load report - report_df = pd.read_csv( - report_path, delimiter="\t", index_col=0, squeeze=True, names=["value"] - ) - + report_df = pd.read_csv(report_path, delimiter="\t", names=["name", "value"]) + report_df.set_index("name", inplace=True) # select genome fraction (%) try: - fraction = float(report_df.at["Genome fraction (%)"]) / 100 + fraction = float(report_df.at["Genome fraction (%)", "value"]) / 100 except: # no "Genome fraction (%)" in quast report. Case for not assemblable samples fraction = 0.0