-
Notifications
You must be signed in to change notification settings - Fork 0
/
dna.py
100 lines (75 loc) · 2.83 KB
/
dna.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import csv
import sys
def main():
# Check for command-line usage
if len(sys.argv) != 3:
print("Missing command-line argument")
sys.exit(1)
# Read database file into a variable
dna_counts = []
sequence = ''
subsequences = []
db_filename = sys.argv[1]
with open(db_filename) as db_file:
database = csv.DictReader(db_file)
for row in database:
for key, value in row.items():
if key != 'name':
row[key] = int(value)
subsequences.append(key)
dna_counts.append(row)
# for dna_count in dna_counts:
# print(f'{dna_count}')
# Read DNA sequence file into a variable
sq_filename = sys.argv[2]
with open(sq_filename) as sq_file:
sequence = sq_file.read()
# print(f'\nsequence: {sequence}')
# Find longest match of each STR in DNA sequence
matches = {}
for subsequence in subsequences:
longest_run = longest_match(sequence, subsequence)
matches[subsequence] = longest_run
# print(f'\nmatches:\n {matches}')
# Check database for matching profiles
for i, dna_count in enumerate(dna_counts):
# print(i, dna_count)
count = 0
for key, value in matches.items():
if dna_count[key] == value:
count += 1
else:
break
if count == len(matches) and i < len(dna_counts):
print(dna_count['name'])
break
elif count != len(matches) and i == len(dna_counts) - 1:
print('No match')
def longest_match(sequence, subsequence):
"""Returns length of longest run of subsequence in sequence."""
# Initialize variables
longest_run = 0
subsequence_length = len(subsequence)
sequence_length = len(sequence)
# Check each character in sequence for most consecutive runs of subsequence
for i in range(sequence_length):
# Initialize count of consecutive runs
count = 0
# Check for a subsequence match in a "substring" (a subset of characters) within sequence
# If a match, move substring to next potential match in sequence
# Continue moving substring and checking for matches until out of consecutive matches
while True:
# Adjust substring start and end
start = i + count * subsequence_length
end = start + subsequence_length
# If there is a match in the substring
if sequence[start:end] == subsequence:
count += 1
# If there is no match in the substring
else:
break
# Update most consecutive matches found
longest_run = max(longest_run, count)
# After checking for runs at each character in seqeuence, return longest run found
return longest_run
main()