forked from melanieabrams/bremdata
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path7fitness_ratios_uncollapsedMBA_nozeros_unpaired.py
65 lines (53 loc) · 2.4 KB
/
7fitness_ratios_uncollapsedMBA_nozeros_unpaired.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from sys import argv
import sys
import pandas as pd
import numpy as np
import math
import re
### USAGE ###
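# python 7fitness_ratios_uncollapsedMBA_nozeros_unpaired.py output_folder file_location/filtered_file.filtered_inserts [more_files ...]
# For each insert, calculates log2(39/28) using final normalized read counts:
# every 39* read column is divided by the 28_br_averaged_reads column.
# Read counts of zero are treated as missing (NaN), and one
# <file_identifier>_unpaired.insert_ratios file is written to output_folder per input file.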
def parse_file(filename, sep='\t'):
    """Load a delimited table and derive an identifier from its file name.

    Args:
        filename: Path of the file to read; it is converted with ``str()``
            before the identifier is extracted.
        sep: Field delimiter handed to ``pandas.read_csv`` (tab by default).

    Returns:
        A ``(df, file_identifier)`` tuple, where ``df`` is the parsed
        DataFrame and ``file_identifier`` is the last path component with
        everything from its final period onwards removed.  A name without
        an extension (or one that only starts with a period) is returned
        unchanged.
    """
    df = pd.read_csv(filename, sep=sep)
    # Only the last path component may contribute to the identifier;
    # otherwise a dotted directory name such as "run.1/" would win.
    base_name = str(filename).rsplit('/', 1)[-1]
    stem = base_name.rpartition('.')[0]
    # An empty stem means there was no extension to strip.
    file_identifier = stem or base_name
    return df, file_identifier
# Column holding the averaged 28 reads; every 39* column is divided by it.
AVG_28_COLUMN = '28_br_averaged_reads'


def select_read_columns(columns):
    """Return the names in *columns* that identify read-count columns.

    A name qualifies when it contains '_averaged_reads' or '_n_av'.
    """
    return [col for col in columns
            if '_averaged_reads' in col or '_n_av' in col]


def add_log2_ratios(each_df):
    """Return a copy of *each_df* with one log2(39/28) column per 39* column.

    Zeros are replaced by NaN before dividing, so an insert with a zero
    count on either side gets NaN instead of an infinite ratio.  For every
    read column starting with '39', a column named '39_28_log2' plus the
    remainder of that column's name is added.  Any remaining +/-inf values
    in the frame are converted to NaN.

    Raises:
        KeyError: if there are 39* read columns but the
            '28_br_averaged_reads' column is missing.
    """
    each_df = each_df.replace(to_replace=0.0, value=np.nan)
    read_columns = select_read_columns(each_df.columns.values)
    reads_39 = [col for col in read_columns if col.startswith('39')]
    if reads_39 and AVG_28_COLUMN not in read_columns:
        # Fail with a clear message rather than a NameError or a
        # denominator silently carried over from a previous file.
        raise KeyError('missing required column: ' + AVG_28_COLUMN)
    for biorep39 in reads_39:
        col_name = '39_28_log2' + biorep39[2:]
        print(col_name)
        each_df[col_name] = np.log2(each_df[biorep39] / each_df[AVG_28_COLUMN])
    return each_df.replace([np.inf, -np.inf], np.nan)


def main(args):
    """Compute the ratios for every input file and write one output each.

    Args:
        args: Command-line style list: program name, output folder, then
            one or more input files.
    """
    if len(args) < 3:
        sys.exit('usage: python fitness_ratios.py output_folder '
                 'file_location/filtered_file.filtered_inserts [...]')
    # rstrip (not strip) so an absolute output folder keeps its leading '/'.
    output_folder = args[1].rstrip('/')
    file_id_dict = {}
    for each_file in args[2:]:
        df, file_identifier = parse_file(each_file)
        df.set_index('ID', inplace=True, drop=False)
        file_id_dict[file_identifier] = df
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    for file_identifier, each_df in file_id_dict.items():
        each_df = add_log2_ratios(each_df)
        print('writing csv...')
        each_df.to_csv(str(output_folder)+str('/')+str(file_identifier)+'_unpaired.insert_ratios', sep='\t', index=False)


if __name__ == '__main__':
    main(argv)