Skip to content

Commit cbfb6f7

Browse files
committed
Merge branch 'hotfix/1.1.0' into main
Fixed bug that added 1 to total fragment length; this will slightly increase the fpbm_br and fpbm_nbr values. Added option to analyse long read data.
2 parents 30a304d + 76809db commit cbfb6f7

File tree

7 files changed

+222
-36
lines changed

7 files changed

+222
-36
lines changed

CHANGES.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11

2+
## 1.1.0
3+
* Added option to analyse long read data
4+
* Fixed bug that added 1 to total fragment length; this will slightly increase the fpbm_br and fpbm_nbr values
5+
* Added option to add header line to output
6+
27
## 1.0.0
38
* First release to calculate TA repeats FPBM values using samtools bedcoverage data
49

Dockerfile

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ USER root
33

44
55

6-
ENV ANALYSE_TA_VER '1.0.0'
6+
ENV ANALYSE_TA_VER '1.1.0'
77

88
# install system tools
99
RUN apt-get -yq update
@@ -40,7 +40,7 @@ RUN pip3 install --install-option="--prefix=$CGP_OPT/python-lib" dist/$(ls -1 di
4040
FROM ubuntu:20.04
4141

4242
LABEL uk.ac.sanger.cgp="Cancer Genome Project, Wellcome Sanger Institute" \
43-
version="1.0.0" \
43+
version="1.1.0" \
4444
description="Tool to perform TA repeat bed coverage analysis"
4545

4646
### security upgrades and cleanup

README.md

+8
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,14 @@ Various exceptions can occur for malformed input files.
4949

5050
* ```test_sample 6.12 15.8``` command line output <sample_name> <mean_fpbm_broken> <mean_fpbm_non_broken>.
5151

52+
### outputFormat with dnovo flag set
53+
* ```Applicable only to Long Read sequencing data ```
54+
Use of the dnovo flag will classify the TA repeat regions into broken and non-broken based on the user-defined dnovo_cutoff parameter.
55+
This is applicable to long read data where the average length of the TA repeat is known for each interval.
56+
* ```br:broken```
57+
* ```nbr:non_broken```
58+
* ```test_sample 6.12 15.8``` command line output <sample_name> <mean_fpbm_br> <mean_fpbm_nbr> <ref_br> <ref_nbr> <mean_fpbm_dnovo_br> <mean_fpbm_dnovo_nbr> <dnovo_br> <dnovo_nbr> <dnovo_in_ref_br> <dnovo_in_ref_nbr> <cumulative_fpbm_br> <cumulative_fpbm_nbr> <jaccard_br> <jaccard_nbr>.
59+
5260
## INSTALL
5361
Installing via `pip install`. Simply execute with the path to the compiled 'whl' found on the [release page][analyse_ta-releases]:
5462

analyse_ta/commandline.py

+74-13
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
import argparse
55
import pkg_resources
6+
67
# load config and reference files....
78

89
version = pkg_resources.require("analyse_ta")[0].version
@@ -11,32 +12,92 @@
1112
def main(): # pragma: no cover <--
1213
usage = "\n %prog [options] -br input_br.bedcov -nbr input_nbr.bedcov -s <sample>"
1314

14-
optParser = argparse.ArgumentParser(prog='analyse_ta',
15-
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
15+
optParser = argparse.ArgumentParser(
16+
prog="analyse_ta", formatter_class=argparse.ArgumentDefaultsHelpFormatter
17+
)
1618
optional = optParser._action_groups.pop()
17-
required = optParser.add_argument_group('required arguments')
19+
required = optParser.add_argument_group("required arguments")
20+
21+
required.add_argument(
22+
"-br",
23+
"--file_br",
24+
type=str,
25+
dest="file_br",
26+
required=True,
27+
default=None,
28+
help="broken ta repeat bed coverage file",
29+
)
30+
required.add_argument(
31+
"-nbr",
32+
"--file_nbr",
33+
type=str,
34+
dest="file_nbr",
35+
required=True,
36+
default=None,
37+
help="non broken ta repeat bed coverage file",
38+
)
39+
40+
optional.add_argument(
41+
"-s",
42+
"--sample_name",
43+
type=str,
44+
dest="sample_name",
45+
required=False,
46+
default="test_sample",
47+
help="sample name",
48+
)
49+
50+
optional.add_argument(
51+
"-dn",
52+
"--dnovo",
53+
action="store_true",
54+
dest="dnovo",
55+
default=False,
56+
help="set flag to analyse dnovo long read data",
57+
)
58+
59+
optional.add_argument(
60+
"-ah",
61+
"--add_header",
62+
action="store_true",
63+
dest="add_header",
64+
default=False,
65+
help="set flag to add_header line, useful in batch mode to set for first sample",
66+
)
1867

19-
required.add_argument("-br", "--file_br", type=str, dest="file_br", required=True,
20-
default=None, help="broken ta repeat bed coverage file")
21-
required.add_argument("-nbr", "--file_nbr", type=str, dest="file_nbr", required=True,
22-
default=None, help="non broken ta repeat bed coverage file")
68+
optional.add_argument(
69+
"-dn_cutoff",
70+
"--dnovo_cutoff",
71+
type=int,
72+
dest="dnovo_cutoff",
73+
required=False,
74+
default=2,
75+
help="cut off length ratio with ref interval to flag as broken region",
76+
)
2377

24-
optional.add_argument("-s", "--sample_name", type=str, dest="sample_name", required=False,
25-
default='test_sample', help="sample name")
26-
optional.add_argument("-v", "--version", action='version', version='%(prog)s ' + version)
27-
optional.add_argument("-q", "--quiet", action="store_false", dest="verbose", required=False, default=True)
78+
optional.add_argument(
79+
"-v", "--version", action="version", version="%(prog)s " + version
80+
)
81+
optional.add_argument(
82+
"-q",
83+
"--quiet",
84+
action="store_false",
85+
dest="verbose",
86+
required=False,
87+
default=True,
88+
)
2889

2990
optParser._action_groups.append(optional)
3091
if len(sys.argv) == 0:
3192
optParser.print_help()
3293
sys.exit(1)
3394
opts = optParser.parse_args()
3495
if not opts.file_nbr or not opts.file_br:
35-
sys.exit('\nERROR Arguments required\n\tPlease run: analyse_ta.py --help\n')
96+
sys.exit("\nERROR Arguments required\n\tPlease run: analyse_ta.py --help\n")
3697
# vars function returns __dict__ of Namespace instance
3798
processed = processcov.processBedCov(**vars(opts))
3899
print(processed.results)
39100

40101

41-
if __name__ == '__main__':
102+
if __name__ == "__main__":
42103
main()

analyse_ta/process_bedcov.py

+116-19
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,141 @@
11
import os
22
import sys
33
import pandas as pd
4+
import numpy as np
45

56

6-
'''
7+
"""
78
This class calculates mean coverage for broken and non_broken TA repeat depth output from samtools bedcov
8-
'''
9+
"""
910

1011

1112
class processBedCov:
1213
"""
13-
Main class , loads user defined parameters and files
14+
Main class , loads user defined parameters and files
1415
"""
1516

1617
def __init__(self, **kwargs):
17-
self.br_file = kwargs['file_br']
18-
self.nbr_file = kwargs['file_nbr']
19-
self.sample = kwargs.get('sample_name', 'test_sample')
18+
self.br_file = kwargs["file_br"]
19+
self.nbr_file = kwargs["file_nbr"]
20+
self.dnovo = kwargs["dnovo"]
21+
self.add_header = kwargs["add_header"]
22+
self.dnovo_cutoff = kwargs["dnovo_cutoff"]
23+
self.sample = kwargs.get("sample_name", "test_sample")
2024
# check input data ...
2125
self.results = self.process()
2226

2327
def process(self):
24-
mydf_br = create_df_to_merge(self.br_file, 'br')
25-
mydf_nbr = create_df_to_merge(self.nbr_file, 'nbr')
28+
mydf_br = create_df_to_merge(self.br_file, "br", self.dnovo_cutoff, self.dnovo)
29+
mydf_nbr = create_df_to_merge(
30+
self.nbr_file, "nbr", self.dnovo_cutoff, self.dnovo
31+
)
2632
merged_df = pd.concat([mydf_nbr, mydf_br])
27-
mean_fpmb_const = merged_df['frl'].sum(axis=0)
28-
merged_df['fpbm'] = merged_df['frl'] / mean_fpmb_const * (10**6)
29-
br_mean = merged_df.loc[merged_df.ta_type == 'br']
30-
nbr_mean = merged_df.loc[merged_df.ta_type == 'nbr']
31-
return f"{self.sample}\t{br_mean['fpbm'].mean(axis=0):.2f}\t{nbr_mean['fpbm'].mean(axis=0):.2f}"
33+
mean_fpmb_const = merged_df["frl"].sum(axis=0)
34+
merged_df["fpbm"] = merged_df["frl"] / mean_fpmb_const * (10**6)
35+
br_mean = merged_df.loc[merged_df.ta_type == "br"]
36+
nbr_mean = merged_df.loc[merged_df.ta_type == "nbr"]
37+
header = f"sample\tfpbm_br\tfpbm_nbr\n"
38+
output = ""
39+
if self.add_header:
40+
output = header
41+
if self.dnovo:
42+
if self.add_header:
43+
output = output.strip()
44+
output = (f"{output}\tref_br\tref_nbr\tmean_fpbm_dnovo_br\tmean_fpbm_dnovo_br\tdnovo_br\tdnovo_nbr"
45+
f"\tdnovo_in_ref_br\tdnovo_in_ref_nbr\tcumulative_fpbm_br"
46+
f"\tcumulative_fpbm_nbr\tjaccard_br\tjaccard_nbr\n")
47+
br_mean_dnovo = merged_df.loc[merged_df.ta_type_dnovo == "br"]
48+
nbr_mean_dnovo = merged_df.loc[merged_df.ta_type_dnovo == "nbr"]
49+
br_cumulative_mean = merged_df.loc[
50+
(merged_df["ta_type_dnovo"] == "br") | (merged_df["ta_type"] == "br")
51+
]
52+
nbr_cumulative_mean = merged_df.loc[
53+
(merged_df["ta_type_dnovo"] == "nbr") | (merged_df["ta_type"] == "nbr")
54+
]
55+
num_br = len(merged_df[merged_df["ta_type"] == "br"])
56+
num_nbr = len(merged_df[merged_df["ta_type"] == "nbr"])
57+
num_br_dnovo = len(merged_df[merged_df["ta_type_dnovo"] == "br"])
58+
num_nbr_dnovo = len(merged_df[merged_df["ta_type_dnovo"] == "nbr"])
59+
br_m11 = len(
60+
merged_df[
61+
(merged_df["ta_type_dnovo"] == "br") & (merged_df["ta_type"] == "br")
62+
]
63+
)
64+
br_m01 = len(
65+
merged_df[
66+
(merged_df["ta_type_dnovo"] != "br") & (merged_df["ta_type"] == "br")
67+
]
68+
)
69+
br_m10 = len(
70+
merged_df[
71+
(merged_df["ta_type_dnovo"] == "br") & (merged_df["ta_type"] != "br")
72+
]
73+
)
74+
nbr_m11 = len(
75+
merged_df[
76+
(merged_df["ta_type_dnovo"] == "nbr") & (merged_df["ta_type"] == "nbr")
77+
]
78+
)
79+
nbr_m01 = len(
80+
merged_df[
81+
(merged_df["ta_type_dnovo"] != "nbr") & (merged_df["ta_type"] == "nbr")
82+
]
83+
)
84+
nbr_m10 = len(
85+
merged_df[
86+
(merged_df["ta_type_dnovo"] == "nbr") & (merged_df["ta_type"] != "nbr")
87+
]
88+
)
3289

90+
jindex_br = (br_m11 / (br_m01 + br_m10 + br_m11)) * 100
91+
jindex_nbr = (nbr_m11 / (nbr_m01 + nbr_m10 + nbr_m11)) * 100
3392

34-
def create_df_to_merge(infile, ta_type):
93+
return (
94+
f"{output}{self.sample}\t{br_mean['fpbm'].mean(axis=0):.2f}"
95+
f"\t{nbr_mean['fpbm'].mean(axis=0):.2f}\t{num_br}\t{num_nbr}"
96+
f"\t{br_mean_dnovo['fpbm'].mean(axis=0):.2f}\t{nbr_mean_dnovo['fpbm'].mean(axis=0):.2f}"
97+
f"\t{num_br_dnovo}\t{num_nbr_dnovo}\t{br_m11}\t{nbr_m11}"
98+
f"\t{br_cumulative_mean['fpbm'].mean(axis=0):.2f}"
99+
f"\t{nbr_cumulative_mean['fpbm'].mean(axis=0):.2f}\t{jindex_br:.2f}\t{jindex_nbr:.2f}"
100+
)
101+
102+
output = f"{output}{self.sample}\t{br_mean['fpbm'].mean(axis=0):.2f}\t{nbr_mean['fpbm'].mean(axis=0):.2f}"
103+
return output
104+
105+
106+
def create_df_to_merge(infile, ta_type, dnovo_cutoff, dnovo=None):
35107
"""
36-
create pandas data frame
108+
create pandas data frame
37109
"""
38110
if not os.path.isfile(infile):
111+
print(f"File not found {infile}")
39112
return None
40-
df = pd.read_csv(infile, compression='infer', sep="\t", low_memory=False,
41-
header=None, names=['chr', 'start', 'end', 'coverage'])
42-
df['frl'] = df['coverage'] / (df['end'] - df['start']) + 1
43-
df['ta_type'] = ta_type
113+
df = pd.read_csv(
114+
infile,
115+
compression="infer",
116+
sep="\t",
117+
low_memory=False,
118+
header=None,
119+
names=["chr", "start", "end", "coverage"],
120+
)
121+
122+
if dnovo:
123+
df["frl"] = (df["coverage"] * 2) / ((df["end"] - df["start"]) + 1)
124+
df["ta_type_dnovo"] = np.select(
125+
[df["frl"] <= dnovo_cutoff, df["frl"] > dnovo_cutoff], ["nbr", "br"]
126+
)
127+
df["ta_type"] = ta_type
128+
129+
else:
130+
df["frl"] = df["coverage"] / ((df["end"] - df["start"]) + 1)
131+
df["ta_type"] = ta_type
44132
return df
133+
134+
135+
def _print_df(mydf, out_file):
136+
if out_file:
137+
mydf.to_csv(
138+
out_file, sep="\t", mode="w", header=True, index=True, doublequote=False
139+
)
140+
else:
141+
sys.exit("Outfile not provided")

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from setuptools import setup
44

55
config = {
6-
'version': '1.0.0',
6+
'version': '1.1.0',
77
'name': 'analyse_ta',
88
'description': 'Tool to analyse TA repeats bed coverage...',
99
'author': 'Shriram Bhosle',

test/test_ta.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,26 @@ class TestClass():
1313
test_dir = configdir + '/data/'
1414
options = {'file_br': test_dir + 'test_br.bedcov',
1515
'file_nbr': test_dir + 'test_nbr.bedcov',
16+
'dnovo':False,
17+
'add_header':True,
18+
'dnovo_cutoff':2,
19+
'sample_name': 'my_test_sample',
20+
}
21+
options_ta_ext = {'file_br': test_dir + 'test_br.bedcov',
22+
'file_nbr': test_dir + 'test_nbr.bedcov',
23+
'dnovo':True,
24+
'add_header':False,
25+
'dnovo_cutoff':10,
1626
'sample_name': 'my_test_sample',
1727
}
1828
processed=processcov.processBedCov(**options)
29+
processed_ta_ext=processcov.processBedCov(**options_ta_ext)
1930
# celline output
2031
def test_bedcov(self):
2132
f=self.processed
22-
assert f.results == "my_test_sample\t35.23\t80.85"
33+
assert f.results == "sample\tfpbm_br\tfpbm_nbr\nmy_test_sample\t33.04\t82.05"
34+
35+
def test_ta_ext(self):
36+
f_ext=self.processed_ta_ext
37+
assert f_ext.results == "my_test_sample\t33.04\t82.05\t5434\t10000\t79.39\t14.36\t11970\t3464\t2776\t806\t67.53\t67.78\t18.98\t6.37"
2338

0 commit comments

Comments
 (0)