|
1 | 1 | import os
|
2 | 2 | import sys
|
3 | 3 | import pandas as pd
|
| 4 | +import numpy as np |
4 | 5 |
|
5 | 6 |
|
6 |
| -''' |
| 7 | +""" |
7 | 8 | This class claculates mean coverage for broken and non_broken TA reapeat depth output fron samtools bedcov
|
8 |
| -''' |
| 9 | +""" |
9 | 10 |
|
10 | 11 |
|
11 | 12 | class processBedCov:
|
12 | 13 | """
|
13 |
| - Main class , loads user defined parameters and files |
| 14 | + Main class , loads user defined parameters and files |
14 | 15 | """
|
15 | 16 |
|
16 | 17 | def __init__(self, **kwargs):
|
17 |
| - self.br_file = kwargs['file_br'] |
18 |
| - self.nbr_file = kwargs['file_nbr'] |
19 |
| - self.sample = kwargs.get('sample_name', 'test_sample') |
| 18 | + self.br_file = kwargs["file_br"] |
| 19 | + self.nbr_file = kwargs["file_nbr"] |
| 20 | + self.dnovo = kwargs["dnovo"] |
| 21 | + self.add_header = kwargs["add_header"] |
| 22 | + self.dnovo_cutoff = kwargs["dnovo_cutoff"] |
| 23 | + self.sample = kwargs.get("sample_name", "test_sample") |
20 | 24 | # check input data ...
|
21 | 25 | self.results = self.process()
|
22 | 26 |
|
23 | 27 | def process(self):
|
24 |
| - mydf_br = create_df_to_merge(self.br_file, 'br') |
25 |
| - mydf_nbr = create_df_to_merge(self.nbr_file, 'nbr') |
| 28 | + mydf_br = create_df_to_merge(self.br_file, "br", self.dnovo_cutoff, self.dnovo) |
| 29 | + mydf_nbr = create_df_to_merge( |
| 30 | + self.nbr_file, "nbr", self.dnovo_cutoff, self.dnovo |
| 31 | + ) |
26 | 32 | merged_df = pd.concat([mydf_nbr, mydf_br])
|
27 |
| - mean_fpmb_const = merged_df['frl'].sum(axis=0) |
28 |
| - merged_df['fpbm'] = merged_df['frl'] / mean_fpmb_const * (10**6) |
29 |
| - br_mean = merged_df.loc[merged_df.ta_type == 'br'] |
30 |
| - nbr_mean = merged_df.loc[merged_df.ta_type == 'nbr'] |
31 |
| - return f"{self.sample}\t{br_mean['fpbm'].mean(axis=0):.2f}\t{nbr_mean['fpbm'].mean(axis=0):.2f}" |
| 33 | + mean_fpmb_const = merged_df["frl"].sum(axis=0) |
| 34 | + merged_df["fpbm"] = merged_df["frl"] / mean_fpmb_const * (10**6) |
| 35 | + br_mean = merged_df.loc[merged_df.ta_type == "br"] |
| 36 | + nbr_mean = merged_df.loc[merged_df.ta_type == "nbr"] |
| 37 | + header = f"sample\tfpbm_br\tfpbm_nbr\n" |
| 38 | + output = "" |
| 39 | + if self.add_header: |
| 40 | + output = header |
| 41 | + if self.dnovo: |
| 42 | + if self.add_header: |
| 43 | + output = output.strip() |
| 44 | + output = (f"{output}\tref_br\tref_nbr\tmean_fpbm_dnovo_br\tmean_fpbm_dnovo_br\tdnovo_br\tdnovo_nbr" |
| 45 | + f"\tdnovo_in_ref_br\tdnovo_in_ref_nbr\tcumulative_fpbm_br" |
| 46 | + f"\tcumulative_fpbm_nbr\tjaccard_br\tjaccard_nbr\n") |
| 47 | + br_mean_dnovo = merged_df.loc[merged_df.ta_type_dnovo == "br"] |
| 48 | + nbr_mean_dnovo = merged_df.loc[merged_df.ta_type_dnovo == "nbr"] |
| 49 | + br_cumulative_mean = merged_df.loc[ |
| 50 | + (merged_df["ta_type_dnovo"] == "br") | (merged_df["ta_type"] == "br") |
| 51 | + ] |
| 52 | + nbr_cumulative_mean = merged_df.loc[ |
| 53 | + (merged_df["ta_type_dnovo"] == "nbr") | (merged_df["ta_type"] == "nbr") |
| 54 | + ] |
| 55 | + num_br = len(merged_df[merged_df["ta_type"] == "br"]) |
| 56 | + num_nbr = len(merged_df[merged_df["ta_type"] == "nbr"]) |
| 57 | + num_br_dnovo = len(merged_df[merged_df["ta_type_dnovo"] == "br"]) |
| 58 | + num_nbr_dnovo = len(merged_df[merged_df["ta_type_dnovo"] == "nbr"]) |
| 59 | + br_m11 = len( |
| 60 | + merged_df[ |
| 61 | + (merged_df["ta_type_dnovo"] == "br") & (merged_df["ta_type"] == "br") |
| 62 | + ] |
| 63 | + ) |
| 64 | + br_m01 = len( |
| 65 | + merged_df[ |
| 66 | + (merged_df["ta_type_dnovo"] != "br") & (merged_df["ta_type"] == "br") |
| 67 | + ] |
| 68 | + ) |
| 69 | + br_m10 = len( |
| 70 | + merged_df[ |
| 71 | + (merged_df["ta_type_dnovo"] == "br") & (merged_df["ta_type"] != "br") |
| 72 | + ] |
| 73 | + ) |
| 74 | + nbr_m11 = len( |
| 75 | + merged_df[ |
| 76 | + (merged_df["ta_type_dnovo"] == "nbr") & (merged_df["ta_type"] == "nbr") |
| 77 | + ] |
| 78 | + ) |
| 79 | + nbr_m01 = len( |
| 80 | + merged_df[ |
| 81 | + (merged_df["ta_type_dnovo"] != "nbr") & (merged_df["ta_type"] == "nbr") |
| 82 | + ] |
| 83 | + ) |
| 84 | + nbr_m10 = len( |
| 85 | + merged_df[ |
| 86 | + (merged_df["ta_type_dnovo"] == "nbr") & (merged_df["ta_type"] != "nbr") |
| 87 | + ] |
| 88 | + ) |
32 | 89 |
|
| 90 | + jindex_br = (br_m11 / (br_m01 + br_m10 + br_m11)) * 100 |
| 91 | + jindex_nbr = (nbr_m11 / (nbr_m01 + nbr_m10 + nbr_m11)) * 100 |
33 | 92 |
|
34 |
| -def create_df_to_merge(infile, ta_type): |
| 93 | + return ( |
| 94 | + f"{output}{self.sample}\t{br_mean['fpbm'].mean(axis=0):.2f}" |
| 95 | + f"\t{nbr_mean['fpbm'].mean(axis=0):.2f}\t{num_br}\t{num_nbr}" |
| 96 | + f"\t{br_mean_dnovo['fpbm'].mean(axis=0):.2f}\t{nbr_mean_dnovo['fpbm'].mean(axis=0):.2f}" |
| 97 | + f"\t{num_br_dnovo}\t{num_nbr_dnovo}\t{br_m11}\t{nbr_m11}" |
| 98 | + f"\t{br_cumulative_mean['fpbm'].mean(axis=0):.2f}" |
| 99 | + f"\t{nbr_cumulative_mean['fpbm'].mean(axis=0):.2f}\t{jindex_br:.2f}\t{jindex_nbr:.2f}" |
| 100 | + ) |
| 101 | + |
| 102 | + output = f"{output}{self.sample}\t{br_mean['fpbm'].mean(axis=0):.2f}\t{nbr_mean['fpbm'].mean(axis=0):.2f}" |
| 103 | + return output |
| 104 | + |
| 105 | + |
| 106 | +def create_df_to_merge(infile, ta_type, dnovo_cutoff, dnovo=None): |
35 | 107 | """
|
36 |
| - create pandas data frame |
| 108 | + create pandas data frame |
37 | 109 | """
|
38 | 110 | if not os.path.isfile(infile):
|
| 111 | + print(f"File not found {infile}") |
39 | 112 | return None
|
40 |
| - df = pd.read_csv(infile, compression='infer', sep="\t", low_memory=False, |
41 |
| - header=None, names=['chr', 'start', 'end', 'coverage']) |
42 |
| - df['frl'] = df['coverage'] / (df['end'] - df['start']) + 1 |
43 |
| - df['ta_type'] = ta_type |
| 113 | + df = pd.read_csv( |
| 114 | + infile, |
| 115 | + compression="infer", |
| 116 | + sep="\t", |
| 117 | + low_memory=False, |
| 118 | + header=None, |
| 119 | + names=["chr", "start", "end", "coverage"], |
| 120 | + ) |
| 121 | + |
| 122 | + if dnovo: |
| 123 | + df["frl"] = (df["coverage"] * 2) / ((df["end"] - df["start"]) + 1) |
| 124 | + df["ta_type_dnovo"] = np.select( |
| 125 | + [df["frl"] <= dnovo_cutoff, df["frl"] > dnovo_cutoff], ["nbr", "br"] |
| 126 | + ) |
| 127 | + df["ta_type"] = ta_type |
| 128 | + |
| 129 | + else: |
| 130 | + df["frl"] = df["coverage"] / ((df["end"] - df["start"]) + 1) |
| 131 | + df["ta_type"] = ta_type |
44 | 132 | return df
|
| 133 | + |
| 134 | + |
| 135 | +def _print_df(mydf, out_file): |
| 136 | + if out_file: |
| 137 | + mydf.to_csv( |
| 138 | + out_file, sep="\t", mode="w", header=True, index=True, doublequote=False |
| 139 | + ) |
| 140 | + else: |
| 141 | + sys.exit("Outfile not provided") |
0 commit comments