From 8c34cda01b22077511000d4fbccb64c14025ed2a Mon Sep 17 00:00:00 2001
From: Remi-Andre Olsen <remi-andre.olsen@scilifelab.se>
Date: Mon, 20 Nov 2023 13:08:37 +0100
Subject: [PATCH] Report to dataframe csv file

---
 anglerfish/anglerfish.py   |  1 +
 anglerfish/demux/report.py | 37 +++++++++++++++++++++++++++++++++++--
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py
index 2987912..6184fe5 100755
--- a/anglerfish/anglerfish.py
+++ b/anglerfish/anglerfish.py
@@ -127,6 +127,7 @@ def run_demux(args):
 
     report.write_report(args.out_fastq)
     report.write_json(args.out_fastq)
+    report.write_dataframe(args.out_fastq, ss)
 
     if args.skip_fastqc:
         log.warning(" As of version 0.4.1, built in support for FastQC + MultiQC is removed. The '-f' flag is redundant.")
diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py
index 458b86b..78f471b 100644
--- a/anglerfish/demux/report.py
+++ b/anglerfish/demux/report.py
@@ -1,6 +1,6 @@
 import os
 import json
-from dataclasses import dataclass
+from dataclasses import dataclass, asdict
 from typing import ClassVar
 
 class Report(object):
@@ -58,6 +58,39 @@ def write_json(self, outdir):
         with open(os.path.join(outdir,"anglerfish_stats.json"), "w") as f:
             f.write(json.dumps(json_out,indent=2, sort_keys=True))
 
+    def write_dataframe(self, outdir, samplesheet):
+        """Write per-sample and unmatched-index stats to <outdir>/anglerfish_dataframe.csv.
+
+        Sample rows are the dataclass fields of each entry in self.sample_stats plus adaptor_name,
+        i7_index and i5_index from the samplesheet entry with the same sample_name (None when no
+        entry matches, so all rows share columns). Only the first entry of each unmatched_stats value is reported.
+        TODO: This needs to be cleaned up and made more robust. Especially lock in / decouple from upstream the header names and order:
+        sample_name, num_reads, mean_read_len, std_read_len, i5_reversed, ont_barcode, adaptor_name, i7_index, i5_index
+        """
+        import csv  # function-scope import so the module import block stays untouched
+
+        # One pass over the samplesheet; a later duplicate sample_name overrides an earlier one.
+        adaptors = {entry.sample_name: entry.adaptor for entry in samplesheet}
+        rows = []
+        for sample in self.sample_stats:
+            row = asdict(sample)
+            adaptor = adaptors.get(row["sample_name"])
+            row["adaptor_name"] = None if adaptor is None else adaptor.name
+            row["i7_index"] = None if adaptor is None else adaptor.i7_index
+            row["i5_index"] = None if adaptor is None else adaptor.i5_index
+            rows.append(row)
+        blank = dict.fromkeys(rows[0]) if rows else {}  # unmatched rows start with every column None
+        for key, unmatch in self.unmatched_stats.items():
+            # "i7+i5" or just "i7"; the trailing None stands in for a missing i5.
+            indices = unmatch[0][0].upper().split("+") + [None]
+            rows.append({**blank, "adaptor_name": key[1], "num_reads": unmatch[0][1],
+                         "ont_barcode": key[0], "i7_index": indices[0], "i5_index": indices[1]})
+        columns = list(dict.fromkeys(col for row in rows for col in row))
+        with open(os.path.join(outdir, "anglerfish_dataframe.csv"), "w", newline="") as f:
+            writer = csv.writer(f, lineterminator="\n")  # csv quotes fields with commas/quotes
+            writer.writerow(columns)
+            # str() keeps None/bool rendered as "None"/"True", exactly as the hand-rolled writer did.
+            writer.writerows([str(row.get(col)) for col in columns] for row in rows)
 
 class AlignmentStat(object):
 
@@ -85,7 +118,7 @@ class SampleStat:
     i5_reversed: bool
     ont_barcode: str = None
     header: ClassVar[list] = ["sample_name",
-                              "#reads",
+                              "#reads", # We specify this for historical reasons
                               "mean_read_len",
                               "std_read_len",
                               "i5_reversed",