forked from shenwei1376/EEB5300
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCode_week3_Wei S2.py
95 lines (76 loc) · 2.76 KB
/
Code_week3_Wei S2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 18 10:17:49 2017
@author: weishen
"""
### 1. open the fasta file and save it to the variable lines
fh=open("/Users/weishen/documents/Courses/EEB5300/week3_codechallenge/PitaIlluminaGeneSet.fasta","r")
lines=fh.readlines()
### 2. print the number of genes in the fasta file
genecounts=0
for line in lines:
if line[0]==">":
genecounts=genecounts+1
print genecounts
### 3. calculate statistics for the given file
# creating 3 lists containing gene name, gene sequence and sequence length
SequenceNames=[]
Sequences=[]
SequenceLength=[]
for line in lines:
if line[0]==">":
SequenceNames.append(line)
elif line[0]!=">":
Sequences.append(line)
#counting the nucleotides
nucleotides=0
for nuc in line:
if nuc=="A":
nucleotides=nucleotides+1
elif nuc=="a":
nucleotides=nucleotides+1
elif nuc=="T":
nucleotides=nucleotides+1
elif nuc=="t":
nucleotides=nucleotides+1
elif nuc=="G":
nucleotides=nucleotides+1
elif nuc=="g":
nucleotides=nucleotides+1
elif nuc=="C":
nucleotides=nucleotides+1
elif nuc=="c":
nucleotides=nucleotides+1
SequenceLength.append(nucleotides)
# a. print the minmum length of the sequence and the sequence name associated with this minmun length
indexmin=SequenceLength.index(min(SequenceLength))
MinimumSeq=Sequences[indexmin]
print MinimumSeq
MinimumSeqName=SequenceNames[indexmin]
print MinimumSeqName
# b. print the maximum length of the sequence and the sequence name associated with this maximum length
indexmax=SequenceLength.index(max(SequenceLength))
MaximumSeq=Sequences[indexmax]
print MaximumSeq
MaximumSeqName=SequenceNames[indexmax]
print MaximumSeqName
# c. print the average length of the sequences
AverageLen=sum(SequenceLength)/490
print AverageLen
### output a file with sequence anmes whoses length is less than 500 base pairs
# extract the position information for all the genes that are less than 500 bps
indexless500=[]
for len in SequenceLength:
if len<500:
indexless500.append(SequenceLength.index(len))
# extract the genes based on the position information and put them into a new list of PitalluminaGeneSetSub
PitaIlluminaGeneSetSub=[]
for i in indexless500:
PitaIlluminaGeneSetSub.append(lines[i*2])
PitaIlluminaGeneSetSub.append(lines[i*2+1])
# write the genes into an output file
with open("temp.fasta", "w") as fout:
for item in PitaIlluminaGeneSetSub:
fout.write(item)
### the temp.fasta file appear in my home directory: /Users/weishen/