forked from IanOlin/GeneFinder
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload.py
88 lines (71 loc) · 2.6 KB
/
load.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 1 22:02:04 2014
@author: pruvolo
"""
from os import path
def load_seq(fasta_file):
    """ Reads a FASTA file and returns the DNA sequence as a string.

        The first line of the file is treated as the header and skipped.
        Every remaining line is appended to the result with its line
        terminator removed; no other filtering is applied.

        fasta_file: the path to the FASTA file containing the DNA sequence
        returns: the DNA sequence as a string (empty if the file holds
                 nothing beyond the first line)
    """
    # The context manager closes the file even if reading fails.
    with open(fasta_file) as f:
        lines = f.readlines()
    # Strip only the line terminator instead of blindly dropping the last
    # character, so a final line without a trailing newline keeps its last
    # nucleotide.  join() also avoids repeated string concatenation.
    return ''.join(line.rstrip('\r\n') for line in lines[1:])
def load_nitrogenase_seq():
    """ This function loads a sequence of DNA that is known to code for
        Nitrogenase.  Nitrogenase is an enzyme that fixes atmospheric
        Nitrogen (N_2)

        The sequence is read from './data/nitrogenase NifH sequence.txt',
        resolved relative to the current working directory.

        returns: the nucleotides in the DNA sequence as an uppercase string
    """
    # The context manager closes the file even if reading fails.
    with open(path.join('.', 'data', 'nitrogenase NifH sequence.txt'),
              'r') as f:
        lines = f.readlines()
    # The first line is skipped as it is simply a sequence name.  On every
    # other line the first 9 characters are discarded (presumably a
    # position/index column -- confirm against the data file).  All
    # remaining whitespace is removed: text mode has already translated
    # '\r\n' to '\n', so replacing the literal '\r\n' would leave newlines
    # embedded in the sequence.
    return ''.join(''.join(line[9:].split()) for line in lines[1:]).upper()
def extract_next_gene(metagenome_lines, next_line):
    """ A helper function for load_metagenome.  This function
        takes an array of lines from the metagenome file and
        the next_line for processing.

        metagenome_lines: list of lines from the metagenome file
        next_line: index of the header line of the snippet to extract;
                   must be a valid index into metagenome_lines
        returns: a tuple consisting of the name of the snippet,
                 the sequence of the snippet, and the line number
                 to process next.
    """
    # The name is the header line, stripped, without its first character.
    name = metagenome_lines[next_line].strip()[1:]
    next_line += 1
    start_line = next_line
    # Advance to the next header line (one starting with '>') or to the end
    # of the input.  startswith() is used instead of indexing [0] so that an
    # empty-string line does not raise IndexError.
    while (next_line < len(metagenome_lines) and
           not metagenome_lines[next_line].startswith('>')):
        next_line += 1
    sequence = ''.join(line.strip()
                       for line in metagenome_lines[start_line:next_line])
    return (name, sequence, next_line)
def load_metagenome():
    """ Loads a metagenome of a bacterial contig.

        The data is read from
        './data/3300000497.a_metagenome_phototrophic community.fna',
        resolved relative to the current working directory.

        returns: a list of DNA snippets consisting of (name, sequence)
                 tuples.  The sequence is represented as an uppercase
                 string of nucleotides
    """
    # The context manager closes the file even if reading fails.
    with open(path.join('.',
                        'data',
                        '3300000497.a_metagenome_phototrophic community.fna'),
              'r') as f:
        metagenome_lines = f.readlines()
    next_line = 0
    snippets = []
    # Each call consumes one snippet and returns the index at which the
    # following one starts, so the loop ends once every line is consumed.
    while next_line < len(metagenome_lines):
        (label, dna, next_line) = extract_next_gene(metagenome_lines,
                                                    next_line)
        snippets.append((label, dna.upper()))
    return snippets