From 8c34cda01b22077511000d4fbccb64c14025ed2a Mon Sep 17 00:00:00 2001 From: Remi-Andre Olsen Date: Mon, 20 Nov 2023 13:08:37 +0100 Subject: [PATCH] Report to dataframe csv file --- anglerfish/anglerfish.py | 1 + anglerfish/demux/report.py | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py index 2987912..6184fe5 100755 --- a/anglerfish/anglerfish.py +++ b/anglerfish/anglerfish.py @@ -127,6 +127,7 @@ def run_demux(args): report.write_report(args.out_fastq) report.write_json(args.out_fastq) + report.write_dataframe(args.out_fastq, ss) if args.skip_fastqc: log.warning(" As of version 0.4.1, built in support for FastQC + MultiQC is removed. The '-f' flag is redundant.") diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py index 458b86b..78f471b 100644 --- a/anglerfish/demux/report.py +++ b/anglerfish/demux/report.py @@ -1,6 +1,6 @@ import os import json -from dataclasses import dataclass +from dataclasses import dataclass, asdict from typing import ClassVar class Report(object): @@ -58,6 +58,39 @@ def write_json(self, outdir): with open(os.path.join(outdir,"anglerfish_stats.json"), "w") as f: f.write(json.dumps(json_out,indent=2, sort_keys=True)) + def write_dataframe(self,outdir,samplesheet): + """Write a dataframe of the stats to a csv file. + TODO: This needs to be cleaned up and made more robust. 
In particular, lock in the header names and order and decouple them from upstream: + sample_name, num_reads, mean_read_len, std_read_len, i5_reversed, ont_barcode, adaptor_name, i7_index, i5_index + """ + out_list = [] + for sample in self.sample_stats: + s_dict = asdict(sample) + for sentry in samplesheet: + sen_dict = asdict(sentry) + if sen_dict["sample_name"] == s_dict["sample_name"]: + s_dict["adaptor_name"] = sen_dict["adaptor"].name + s_dict["i7_index"] = sen_dict["adaptor"].i7_index + s_dict["i5_index"] = sen_dict["adaptor"].i5_index + out_list.append(s_dict) + for key, unmatch in self.unmatched_stats.items(): + un = {i: None for i in out_list[-1].keys()} + i7i5 = [i.upper() for i in unmatch[0][0].split("+")] + if len(i7i5) == 1: + i7i5.append(None) + un["adaptor_name"] = key[1] + un["num_reads"] = unmatch[0][1] + un["ont_barcode"] = key[0] + un["i7_index"] = i7i5[0] + un["i5_index"] = i7i5[1] + out_list.append(un) + with open(os.path.join(outdir,"anglerfish_dataframe.csv"), "w") as f: + out_header = out_list[0].keys() + f.write(",".join(out_header)) + f.write("\n") + for out in out_list: + f.write(",".join([str(out[i]) for i in out_header])) + f.write("\n") class AlignmentStat(object): @@ -85,7 +118,7 @@ class SampleStat: i5_reversed: bool ont_barcode: str = None header: ClassVar[list] = ["sample_name", - "#reads", + "#reads", # We specify this for historical reasons "mean_read_len", "std_read_len", "i5_reversed",