6filter_inserts_uncollapsedMBA_debug.py

import pandas as pd
import numpy as np
from sys import argv
### USAGE ###
# python 6filter_inserts_uncollapsedMBA_debug.py output_folder file_location/*.normalized_averaged_bioreps
# Combines data from all bioreps into a single dataframe, then filters inserts based on the variables below.
# Filtering can use the coefficient of variation between bioreps, the final averaged read count, or both;
# a value exactly equal to the cutoff passes.
# Generates a new file with the filtering info in its name so you can track which file used which
# filtering scheme (coeffvar#_read#_...).
# An insert is kept if it passes the filtering metric at either 39C or 28C; it need not pass at both.
# When an insert is found at one temperature but not the other, a read count of 1 is substituted.
# Also saves a pre-filtering file with all the coding inserts: "all_data.unfiltered_inserts_coding".
# To try multiple filtering schemes, run the rest of the analysis separately for each filtered file.
# Fixed 1/28/2021: numbers are converted to floats before filtering so the boolean comparisons are correct.

coeff_var_cutoff = 2.0 #cutoff for coefficient of variation
norm_reads_cutoff = 1.1 # cutoff for normalized read count
filter_strategy = 'both' # can be 'coeff' or 'reads' or 'both'
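
# A sketch of the pass/fail logic under the defaults above (hypothetical values, not from real data):
#   insert A: 39C coeffvar = 1.5, 28C coeffvar = 3.0  -> passes 'coeff', since 1.5 <= 2.0 at 39C
#   insert B: 39C reads = 0.8,  28C reads = 1.1       -> passes 'reads', since 1.1 >= 1.1 at 28C
# Under 'both', an insert must pass the coeffvar test at either temperature AND the reads test at
# either temperature; the two tests may be satisfied at different temperatures.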

def parse_file(filename, sep='\t'):
    # read one tab-delimited input file into a dataframe
    df = pd.read_csv(filename, sep=sep)
    return df

insert_count = 0  # running count of inserts processed, for progress printing

def fillna_and_combine(grp):
    # Collapses all rows sharing one insert ID into a single row: forward- then back-fill so
    # every condition's columns are populated, then keep the first row.
    # Prints a running count every 10000 inserts so you can see how long the script is taking.
    global insert_count
    insert_count += 1
    if insert_count % 10000 == 0:
        print(insert_count)
    grp['ID'] = grp.index
    filled = grp.ffill().bfill()  # fillna(method=...) is deprecated in recent pandas
    return filled.iloc[0]
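
# For example (hypothetical input): if ID chr4:100200 appears in two rows, one with only the
# 39C columns filled and one with only the 28C columns filled, fillna_and_combine merges them
# into a single row carrying both conditions' values.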

def make_and_populate_new_columns(grp):
    # For each non-T0 condition, copy the per-biorep stats into condition-named columns
    # so the conditions can later sit side by side in one row per insert.
    condition = grp['condition'].iloc[0]  # all rows in grp share one condition; .ix was removed from pandas
    if condition != 'T0':
        grp[condition + '_br_averaged_reads'] = grp['br_averaged_reads']
        grp[condition + '_br_coeffvar'] = grp['br_coeffvar']
    return grp
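
# For example (hypothetical columns): rows with condition '39C' gain '39C_br_averaged_reads' and
# '39C_br_coeffvar' copies, so that after the per-ID merge each insert carries one column per condition.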

if __name__ == '__main__':
    output_folder = argv[1].strip('/')
    files = argv[2:]
    file_dict = {}
    df_list = []
    for each_file in files:
        df = parse_file(each_file)
        df.set_index('ID', inplace=True, drop=False)
        condition_ID = df['condition'].iloc[0]  # every row in a file shares one condition label
        if condition_ID == 'T0':
            df.drop('bio_rep_ID', axis=1, inplace=True)
        file_dict[condition_ID] = df
        df_list.append(df)
    concatd_df = pd.concat(df_list, axis=0)
    all_columns = list(concatd_df.columns.values)
    drop_columns = [col for col in all_columns if '_norm_' in col]
    #drop_columns = [col for col in all_columns if '_n_' in col]  ## changed so individual bioreps are not dropped
    drop_columns.extend([col for col in all_columns if '_tr_' in col])  ## tech reps are still dropped
    concatd_df.drop(drop_columns, axis=1, inplace=True)
    concatd_df.condition = concatd_df.condition.astype(str)
    concatd_df.set_index('condition', inplace=True, drop=False)
    concatd_df.index.name = None  # added for py2.7; keeps 'condition' from being both an index name and a column label in the groupby below
    grouped2 = concatd_df.groupby('condition', sort=False).apply(make_and_populate_new_columns)
    grouped2.set_index('ID', inplace=True, drop=False)
    grouped2.drop(['br_averaged_reads', 'br_coeffvar', 'condition'], axis=1, inplace=True)
    grouped2['NEWID'] = grouped2['ID']
    grouped_tmp = grouped2.groupby('NEWID', sort=False)
    group3 = grouped_tmp.apply(fillna_and_combine)
    columns = list(group3.columns.values)
    to_fill_with_one = [col for col in columns if 'reads' in col]
    group3[to_fill_with_one] = group3[to_fill_with_one].fillna(value=1.0)  # leave empty 'coeffvar' cells as NaN
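    # For example (hypothetical values): an insert sequenced only at 39C gets 1.0 in its 28C reads
    # column so the read-count filter can still evaluate it, while its 28C coeffvar stays NaN and
    # can never satisfy the coeffvar comparison below.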
    group3.to_csv(output_folder + '/' + 'all_data.unfiltered_inserts_coding', sep='\t', index=False)
    group3.replace(to_replace='None', value=0.0, inplace=True)  # replace 'None' strings with 0.0 so the filtering comparisons work
    if filter_strategy == 'coeff':
        # keep the insert if the absolute biorep coeffvar at either condition is at or below the cutoff;
        # NaN compares False on both sides of the |, so a missing coeffvar never passes
        columns_to_filter_by = [col for col in columns if 'br_coeffvar' in col]
        filtered = group3[(np.absolute(group3[columns_to_filter_by[0]]) <= coeff_var_cutoff) | (np.absolute(group3[columns_to_filter_by[1]]) <= coeff_var_cutoff)]
        print(len(group3), 'before filtering coeff')
        print(len(filtered), 'after filtering coeff')
        filtered.to_csv(output_folder + '/' + str(coeff_var_cutoff) + '_coeffvarcutoff.filtered_inserts', sep='\t', index=False)
    if filter_strategy == 'reads':
        # keep the insert if the normalized averaged reads at either condition are at or above the cutoff
        columns_to_filter_by = [col for col in columns if 'br_averaged_reads' in col]
        filtered = group3[(group3[columns_to_filter_by[0]] >= norm_reads_cutoff) | (group3[columns_to_filter_by[1]] >= norm_reads_cutoff)]
        print(len(group3), 'before filtering reads')
        print(len(filtered), 'after filtering reads')
        filtered.to_csv(output_folder + '/' + str(norm_reads_cutoff) + '_readscutoff.filtered_inserts', sep='\t', index=False)
    if filter_strategy == 'both':
        # cast every column used in a boolean comparison to float, or the comparisons misbehave on strings
        columns_to_filter_by = [col for col in columns if 'br_coeffvar' in col]
        columns_to_filter_by2 = [col for col in columns if 'br_averaged_reads' in col]
        all_columns_to_filter_by = columns_to_filter_by + columns_to_filter_by2
        group3 = group3.astype({column: 'float' for column in all_columns_to_filter_by})
        # coeffvar test at either condition first, then the read-count test at either condition
        filtered = group3[(group3[columns_to_filter_by[0]] <= coeff_var_cutoff) | (group3[columns_to_filter_by[1]] <= coeff_var_cutoff)]
        filtered2 = filtered[(filtered[columns_to_filter_by2[0]] >= norm_reads_cutoff) | (filtered[columns_to_filter_by2[1]] >= norm_reads_cutoff)]
        print(len(group3), 'before filtering')
        print(len(filtered), 'after filtering for coeffvar')
        print(len(filtered2), 'after filtering for coeffvar and read count')
        filtered2.to_csv(output_folder + '/' + str(coeff_var_cutoff) + '_' + str(norm_reads_cutoff) + '_coeffvarANDreadcutoff_uncollapsedMBA.filtered_inserts', sep='\t', index=False)
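        # With the defaults above (coeff_var_cutoff=2.0, norm_reads_cutoff=1.1) this writes, e.g.:
        #   output_folder/2.0_1.1_coeffvarANDreadcutoff_uncollapsedMBA.filtered_inserts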