-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmajsat.py
189 lines (163 loc) · 7.66 KB
/
majsat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
### FIRST: RUN THIS ###
import subprocess
import regex
import csv
import statistics
from Bio import SeqIO
from Bio import SearchIO
# change path to folder where you downloaded fasta reads
path = "/Users/damian/Downloads/sratoolkit.3.1.1-mac-arm64/bin/SRR11606870/fasta/"
# get all records with the GAAAACTGAAAA motif specific to major satellite
input_iterator = SeqIO.parse(path + "SRR11606870.fasta", "fasta")
Maj_iterator = (record for record in input_iterator if "GAAAACTGAAAA" in record.seq
and "ATTCGTTGGAAACGGGA" not in record.seq) # avoid mixed zone of Maj and Min sat
SeqIO.write(Maj_iterator, path + "SRR11606870_Maj.fasta", "fasta")
# make blast database locally (modify path to match the path with downloaded fasta reads)
cmd = "makeblastdb -in /Users/damian/Downloads/sratoolkit.3.1.1-mac-arm64/bin/SRR11606870/fasta/SRR11606870_Maj.fasta -dbtype nucl"
subprocess.run(cmd, shell=True)
### ------------------------------------------------------------------------------------------###
### SECOND: Copy the SRR11606870_2342980.fasta file from github cloned folder into the path directory
# if you havent done it yet ###
### ------------------------------------------------------------------------------------------###
### THIRD: RUN THE REST OF THE CODE ###
# blast representative Type 1 Continuous Maj sat array (2342980) from Packiaraj and Thakur 2024 Genome Biol (PMID: 38378611)
path2 = path + "SRR11606870_Maj.fasta"
homo_cont_array_path = path + "SRR11606870_2342980.fasta" # make sure this read as fasta file is in the same folder
outpath = path + "SRR11606870_Maj.xml"
cmd3 = "blastn -db " + path2 + " -query " + homo_cont_array_path + " -outfmt 5 -out " + outpath
subprocess.run(cmd3, shell=True)
# gets blast results
blast_qresult = SearchIO.read(outpath, "blast-xml")
hit_keys = blast_qresult.hit_keys # gets IDs of hits
hit_iterator = SeqIO.parse(path + "SRR11606870_Maj.fasta", "fasta") # generator to iterate fast through
hit_iterator_blasted = (record for record in hit_iterator if record.id in hit_keys) # get sequences of hits
SeqIO.write(hit_iterator_blasted, path + "SRR11606870_Maj_blasted.fasta", "fasta") # save to file
# convert reads to oneliners
with open(path + "SRR11606870_Maj_blasted.fasta", "r") as f:
reads = f.readlines()
d = {}
header = ">"
seq = ""
for line in reads:
if line.startswith(">"):
head = line.lstrip(">").rstrip("\n")
seq = ""
else:
seq = seq + line.rstrip("\n").upper()
d[head] = seq
with open(path + "SRR11606870_Maj_blasted_oneliner.fasta", 'w') as convert_file:
for k, v in d.items():
convert_file.write(">" + k.rstrip("\n"))
convert_file.write("\n")
convert_file.write(v + "\n")
# find >=4 AT stretches
Maj_sat_ATstretches = {}
density_list = []
with open(path + "SRR11606870_Maj_blasted_oneliner.fasta", "r") as f:
reads = f.readlines()
for line in reads:
if line.startswith(">"):
id = line
read = ""
else:
read = line.rstrip("\n")
ATstretches = regex.findall(r"[AT]{4,}", read, overlapped=False)
nr_stretches = len(ATstretches)
density = len(ATstretches)/(len(read)/234)
density_list.append(density)
Maj_sat_ATstretches[id] = read, ATstretches, nr_stretches, density
# write into file
with open(path + "SRR11606870_Maj_ATstretches_average.fasta", 'w') as convert_file:
for k, v in Maj_sat_ATstretches.items():
convert_file.write(k.rstrip("\n"))
convert_file.write("\n")
convert_file.write(v[0])
convert_file.write("\n")
convert_file.write(str(v[1]))
convert_file.write("\n")
convert_file.write("Number of stretches in array = %s" % str(v[2]))
convert_file.write("\n")
convert_file.write("Density of stretches per 234bp = %s" % str(v[3]))
convert_file.write("\n")
convert_file.write("\nAverage density of stretches per 234bp = %s" % statistics.mean(density_list))
convert_file.write("\nAll densities of stretches per 234bp = %s" % density_list)
# Save densities into file
with open(path + "SRR11606870_Maj_ATstretches_average.csv", "w") as f:
write = csv.writer(f)
write.writerow(density_list)
# Compute most narrow tetranucleotides from Rohs et al., 2009 Nature (PMID: 19865164)
ATs = ["AAAT", "AATA", "AATC", "AATT", "AAAA", "AAGT", "GAAT", "GAAA", "TAAT", "AAAC", "ATAA",
"AGAT", "AAGA", "AGTT", "AGAA", "AAAG", "ATAG", "GAAC", "CGTT", "TATA"]
Maj_sat_ATstretches = {}
density_list = []
stretches = {}
stretches_total = {}
for s in ATs:
stretches_total[s] = 0
with open(path + "SRR11606870_Maj_blasted_oneliner.fasta", "r") as f:
reads = f.readlines()
total_length = 0
for line in reads:
if line.startswith(">"):
id = line
read = ""
else:
read = line.rstrip("\n")
nr_stretches = 0
for stretch in ATs:
ATstretches = regex.findall(stretch, read, overlapped=True)
stretches[stretch] = len(ATstretches)
nr_stretches += len(ATstretches)
stretches_total[stretch] += len(ATstretches)
length = len(read)
total_length += length
density = nr_stretches/(length/234)
density_list.append(density)
Maj_sat_ATstretches[id] = read, stretches, nr_stretches, density, length
with open(path + "SRR11606870_Maj_teranucleotides_average.fasta", 'w') as convert_file:
for k, v in Maj_sat_ATstretches.items():
convert_file.write(k.rstrip("\n"))
convert_file.write("\n")
convert_file.write(v[0])
convert_file.write("\n")
convert_file.write(str(v[1]))
convert_file.write("\n")
convert_file.write("Number of stretches in array = %s" % str(v[2]))
convert_file.write("\n")
convert_file.write("Density of stretches per 234bp = %s" % str(v[3]))
convert_file.write("\n")
convert_file.write("\nAverage density of stretches per 234bp = %s" % statistics.mean(density_list))
for stretch, number in stretches_total.items():
convert_file.write("\nAverage density of %s stretches per 234bp = %s" % (stretch, str(number/(total_length/234))))
narrow_ATs = ["AAAT", "AATA", "AATC", "AATT", "AAAA", "AAGT", "GAAT", "GAAA", "TAAT", "AAAC"]
with open(path + "SRR11606870_Maj_blasted_oneliner.fasta", "r") as f:
lines = f.readlines()
total_length = 0
wanted_array = False
for line in lines:
# select the array you want to analyze (change ID number)
if "2342980" in line:
wanted_array = True
if not line.startswith(">") and wanted_array:
seq = line.rstrip("\n")
break
bin_size = 1000
num_bins = (len(seq) + bin_size - 1) // bin_size # number of bins needed
### Use sliding window to get tetras overlapping and their positions
# Get dict where each tetra will get its frequency
Maj_freq_dict_over = {bin_index: {tetra: 0 for tetra in narrow_ATs} for bin_index in range(num_bins)}
for i in range(len(seq) - 3):
# Get current tetra
current_tetra = seq[i:i + 4]
# Put it in the current bin
bin_index = i // bin_size
# See if that tetra is narrow based on the list of ATs and if so, count it
if current_tetra in narrow_ATs:
Maj_freq_dict_over[bin_index][current_tetra] += 1
# write into file (with help from https://stackoverflow.com/questions/29400631/python-writing-nested-dictionary-to-csv)
with open(path + "SRR11606870_Maj_tetranucleotides.csv", "w") as csvfile:
writer = csv.DictWriter(csvfile, narrow_ATs)
for key, val in sorted(Maj_freq_dict_over.items()):
row = {"AAAT": key}
row.update(val)
writer.writerow(row)