-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathconvert_poolcount_to_fastqpooledreadsclean_v2.py
164 lines (133 loc) · 5.37 KB
/
convert_poolcount_to_fastqpooledreadsclean_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import sys
from sys import argv
import pandas as pd
import numpy as np
import re
import string
### USAGE ###
# python convert_poolcount_to_fastqpooledreadsclean_v2.py output_folder filtered_insert_ratio_file.poolcounts
# Adds rel_loc; splits tech reps and converts a poolcount file formatted for this
# into .fastq_pooled_read_clean files for the downstream rh-seq pipeline.
##PARAMETERS##
gff='/usr2/people/mabrams/carlygithub/rh-seq/YS2+CBS433+plasmid_clean'
control_temp='28C' #assumes temp in name, e.g. 'Biorep1_28C_66'
#test_temp='4C'
#test_temp='39C'
test_temp='38'
##FUNCTIONS##
def parse_gff(gff):
gff_dict = {}
f = open(gff)
for line in f:
line = line.split("\t")
chrom = line[1]
gene = line[0]
start = int(line[4])
end = int(line[5])
strand = line[3]
type = line[2]
# put the annotation information in a dictionary #
if chrom not in gff_dict:
gff_dict[chrom] = {}
gff_dict[chrom][gene] = [start, end, strand]
f.close()
return gff_dict
def parse_file(filename, sep='\t'):
'''parses the poolcount file into a dataframe'''
df = pd.read_csv(filename, sep=sep)
file_string = str(filename)
p = re.compile('[^/]+[^\.](?=\.)') # matches everything before the period and after the last slash, ie. the identifier of the file
m = p.search(file_string) # finds regex in file name
file_identifier = m.group() # prints match in string format
return df, file_identifier
def retool_columns(df):
#rename columns used in both but named differently
df=df.rename(columns={'barcode':'ID','pos':'location','AlternateID':'annotation','CodingFraction':'rel_prop'})
#get which columns have br n data
br_columns=df.columns[8:]
#drop unused columns
df=df.drop(columns=['LocalGCpercent','NearestGene','Annotation'])
#placeholders for columns without data
df['strand']='seePoolFile'
df['rel_loc']='notCalculated'
df['gene_length']='notCalculated'
#reorder columns to match Carly's format
cols = df.columns.tolist()
cols=cols[:4]+cols[5:-3]+[cols[-2]]+[cols[4]]+[cols[-1]]
print(cols)
#col_order=['ID','scaffold','location','annotation']+br_columns+['rel_loc','rel_prop','gene_length']
df=df[cols]
#return reordered dataframe and the br columns
return df,br_columns
def build_rename_dict(df,br_columns):
'''builds a dictionary of new names for the bioreps named in the scheme:
28A1 - biorep A of control temp, tech rep 1
39A1 - birep A of test temp, tech rep 1'''
control_brs=[]
test_brs=[]
t0_brs=[]
other_brs=[]
for col in br_columns:
if control_temp in col:
control_brs.append(col)
elif test_temp in col:
test_brs.append(col)
elif 'T0_' in col:
t0_brs.append(col)
else:
print('could not assign control or test temperature to column'+str(col))
other_brs.append(col)
print('control brs')
print(control_brs)
print('test brs')
print(test_brs)
#get alphabet for biorep identifiers
alphabet=string.ascii_uppercase
alphabet=alphabet.replace('E','') #remove E b/c bioreps named E introduce a bug downstream
#generate renamin dictionary
rename_dict={}
for i in range(len(control_brs)):
rename_dict[control_brs[i]]='28'+alphabet[i]+'1.fastq_pooled_reads_clean'
for i in range(len(test_brs)):
rename_dict[test_brs[i]]='39'+alphabet[i]+'1.fastq_pooled_reads_clean'
for i in range(len(t0_brs)):
rename_dict[t0_brs[i]]=t0_brs[i]+'.fastq_pooled_reads_clean'
for i in range(len(other_brs)):
rename_dict[other_brs[i]]='no outfile'
print(rename_dict)
return rename_dict
def split_br(df, rename_dict):
'''splits out the data into bioreps according to the new filenames for downstream processing'''
for br in rename_dict:
if rename_dict[br]!='no outfile':
other_brs=list(rename_dict.keys()) #get other brs
#print('all_outfiles')
#print(other_brs)
print('making file for br:')
print(br)
#print('all outfiles remove br')
other_brs.remove(br)
#print(other_brs)
br_df=df.copy() #drop other from a copy of df
for other_br in other_brs:
br_df.drop(other_br, 1, inplace=True)
br_df.rename(columns = {br:'n'}, inplace = True)
br_rename=rename_dict[br]
br_df.to_csv(br_rename, sep='\t', index=False)
return
##START
if __name__ == '__main__':
print('...done importing libraries...')
output_folder = argv[1].strip('/')
poolcount_file = argv[2]
print('...parsing gff...')
gff_dict=parse_gff(gff)
print('...parsing poolcounts file...')
df,file_identifier=parse_file(poolcount_file)
print('...renaming and adding columns...')
df,br_columns=retool_columns(df)
print(df.columns)
print('...splitting by biorep...')
rename_dict=build_rename_dict(df,br_columns)
split_br(df,rename_dict)
print('...done...')