-
Notifications
You must be signed in to change notification settings - Fork 2
/
scrape_adapter.py
178 lines (148 loc) · 5.42 KB
/
scrape_adapter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 17 13:11:16 2017
The purpose of this script is to scrape the fastq file and enter the sequencer
defined adapter to a list. We then use this adapter list to generate a '.fa'
file that cutadapt can use to perform trimming.
@author: pieter
"""
import sys
import subprocess
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
#set defaults
paired_arg = '-u'
adapter_list=[]
polyG = False
#collect arguments
if (sys.argv[1] == '-h') or (sys.argv[1] == '-help'):
print('scrape_adapter is designed to be used with fastq files.\nThese can be either in uncompressed of gzipped (".gz") formats.')
print('Invoke as:\n\tpython scrape_adapter.py [-polyG] [-p] -i fastq_file_name -o output_file_name')
print('-------------------------------------------------------------------')
print('Optional Commands:')
print('-p : paired end reads')
print('-polyG : Alternative Runmode. Removes all reads with a polyG UMI.')
print('\t\tNo adapters are counted when using -polyG run mode.')
print('')
exit()
for arg in range(len(sys.argv)-1):
if sys.argv[arg] == '-p':
paired_arg = '-p'
if sys.argv[arg] == '-i':
infile_name = sys.argv[int(arg+1)]
if sys.argv[arg] == '-o':
outfile_name = sys.argv[int(arg+1)]
if sys.argv[arg] == '-polyG':
polyG = True
#test to see if input is gzipped
#if it is use zcat to make temp uncompressed file
if infile_name.rsplit('.',1)[1]=='gz':
is_gzipped=True
else:
is_gzipped=False
# process gzipped file then open temp
if is_gzipped:
bashCommand = ('zcat %s > temp.fastq')%(infile_name)
output = subprocess.check_output([bashCommand],stderr=subprocess.STDOUT,shell=True)
infile_o = ('temp.fastq')
#open uncompressed file
if not is_gzipped:
infile_o = (infile_name)
# search for and remove instances where UMI is 'GGGGGG'
def remove_polyG(infile_o):
infile = open(infile_o)
filter_polyG = 0
polyG_ct = 0
for line in infile:
if filter_polyG != 0:
if filter_polyG == 3:
filter_polyG = 0
print(line)
print(filter_polyG)
if (filter_polyG > 0) and (filter_polyG < 3) :
filter_polyG+=1
print(line)
print(filter_polyG)
if filter_polyG == 0:
if (line[0]=='@'):
s_line = line.strip()
adapter = s_line.rsplit(':',1)[1]
if ('GGGGGG' in adapter):
filter_polyG = 1
polyG_ct+=1
print(line)
print(filter_polyG)
if ('GGGGGG' not in adapter):
outfile.write(line)
if (line[0]!='@') and (filter_polyG == 0):
outfile.write(line)
infile.close()
print('PolyG count: '+str(polyG_ct))
# parse file keeping only highest frequency adapter
def scrape_file(infile_o):
infile = open(infile_o)
adapter_dict = {}
high_score=0
winner=''
#parse file
for line in infile:
if line[0]=='@':
line = line.strip()
adapter = line.rsplit(':',1)[1]
if ('N' not in adapter):
if (adapter in adapter_dict):
adapter_dict[adapter]+=1
if (adapter not in adapter_dict):
adapter_dict[adapter]=1
print('Scraped the following adapters these many times:')
print(adapter_dict)
for key, value in adapter_dict.items():
if value > high_score:
high_score=value
winner=key
adapter_list.append(winner)
infile.close()
return(adapter_list)
#write out adapter
def write_outadapter(adapter_list,runmode):
if runmode =='fwd':
count = 0
print('Using Adapter' +str(adapter_list))
for adapter in adapter_list:
revadapter = Seq(adapter, IUPAC.unambiguous_dna)
adapter = revadapter.reverse_complement()
outline=('>FwdAdapter_%s\na%sagatcggaagagcacacgtctgaactccagtcacNNNNNNatctcgtatgccgtcttctgcttg\n')%(count,adapter)
print('Trimming with:')
print(outline)
count+=1
outfile.write(outline)
if runmode =='rev':
count = 0
print('Using Adapter' +str(adapter_list))
for adapter in adapter_list:
adapter = Seq(adapter, IUPAC.unambiguous_dna)
#adapter = revadapter.reverse_complement()
outline=('>RevAdapter_%s\naatgatacggcgaccaccgagatctacactctttccctacacgacgctcttccgatct%sa\n')%(count,adapter)
print('Trimming with:')
print(outline)
count+=1
outfile.write(outline)
#actual command statements
if not polyG:
if paired_arg=='-u':
adapter_list = scrape_file(infile_o)
if adapter_list:
outfile = open(outfile_name,'w')
write_outadapter(adapter_list,'fwd')
if paired_arg=='-p':
adapter_list = scrape_file(infile_o)
if adapter_list:
outfile = open(outfile_name,'w')
write_outadapter(adapter_list,'rev')
if polyG:
outfile = open(outfile_name,'w')
remove_polyG(infile_o)
#clean up
outfile.close()
if is_gzipped:
output = subprocess.check_output(['rm temp.fastq'],stderr=subprocess.STDOUT,shell=True)