-
Notifications
You must be signed in to change notification settings - Fork 11
/
randomSample.py
executable file
·119 lines (95 loc) · 3.33 KB
/
randomSample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python
# Copyright (c) 2012,2013, Stephen Fisher and Junhyong Kim, University of
# Pennsylvania. All Rights Reserved.
#
# You may not use this file except in compliance with the Kim Lab License
# located at
#
# http://kim.bio.upenn.edu/software/LICENSE
#
# Unless required by applicable law or agreed to in writing, this
# software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License
# for the specific language governing permissions and limitations
# under the License.
"""
by: S. Fisher, 2012
usage: python randomSample.py <num lines> <lines grouped> <input> <output>
This will return a file that contains the specified number of randomly
sampled lines from the original file. If 'lines grouped' is greater
than 1, then each time a line is selected, the specified number of
lines (grouping size) will also be included. For example a line
grouping of 4 means 4 lines will be included every time a line is
selected, as in the case of a FASTQ file.
"""
import sys, os, subprocess, random
#------------------------------------------------------------------------------------------
#
#------------------------------------------------------------------------------------------
# Set DEBUG to a true value to trace every line read while sampling.
DEBUG = 0
if DEBUG:
    print('DEBUG MODE: ON')

# Expect four positional arguments after the script name:
#   <num lines> <lines grouped> <input> <output>
# sys.argv[0] is the script itself, so at least five entries are required;
# anything shorter would fail with an IndexError when sys.argv[3] or
# sys.argv[4] is read below.
if len(sys.argv) < 5:
    print('Usage: python randomSample.py <num lines> <lines grouped> <input> <output>')
    # A usage error is a failure, so report a non-zero exit status.
    sys.exit(1)

# Number of records (groups of lines) to sample.
NUM_LINES = int(sys.argv[1])
# Number of consecutive lines that make up one record (4 for FASTQ).
LINES_GROUPED = int(sys.argv[2])
# Paths of the file to sample from and the file to write the sample to.
IN_FILE = sys.argv[3]
OUT_FILE = sys.argv[4]

# Both handles stay open for the rest of the script; error() and the end
# of the script are responsible for closing them.
inFile = open(IN_FILE, 'r')
outFile = open(OUT_FILE, 'w')
# get number of lines
def file_len(fname):
    """Return the number of lines in the file *fname*.

    The file is read in binary chunks and newline bytes are counted, so
    no external ``wc`` program is needed.  A final line that is not
    terminated by a newline is counted as a line as well, which matches
    the number of non-empty results ``readline()`` yields for the file.

    Raises IOError if the file cannot be opened or read.
    """
    count = 0
    last_byte = b''
    with open(fname, 'rb') as handle:
        while True:
            chunk = handle.read(1 << 20)
            if not chunk:
                break
            count += chunk.count(b'\n')
            # Remember the final byte seen so an unterminated last line
            # can be detected once the whole file has been consumed.
            last_byte = chunk[-1:]
    if last_byte and last_byte != b'\n':
        count += 1
    return count
def error(msg="ERROR: Ran out of lines in the file\n"):
    """Print *msg*, close the input and output files, and abort the script.

    msg is optional so that a call without arguments still terminates
    cleanly instead of raising TypeError.

    The process exits with status 1 so that calling shells and pipelines
    can detect the failure.  Raises SystemExit and never returns.
    """
    print(msg)
    inFile.close()
    outFile.close()
    # sys.exit is used rather than quit(): quit() is an interactive helper
    # installed by the site module and exits with status 0.
    sys.exit(1)
# Sample NUM_LINES records (groups of LINES_GROUPED consecutive lines) from
# inFile, without replacement, and copy them to outFile in their original
# order.

# Use a lazy integer range on both Python 2 (xrange) and Python 3 (range)
# so the population of record indices is never materialised as a list.
try:
    _lazy_range = xrange
except NameError:
    _lazy_range = range

numSampled = 0
try:
    # A group size below 1 would divide by zero or give a meaningless
    # record count below.
    if LINES_GROUPED < 1:
        error("ERROR: <lines grouped> must be at least 1\n")

    # random.sample() rejects negative sizes, and sampling zero records
    # is not a meaningful request.
    if NUM_LINES < 1:
        error("ERROR: <num lines> must be at least 1\n")

    # Number of complete records in the input; a trailing partial record
    # is ignored by the floor division.
    totalLines = file_len(IN_FILE) // LINES_GROUPED

    # make sure the input file isn't empty (holds at least one record)
    if totalLines == 0:
        error("ERROR: Empty file\n")

    # make sure there are enough lines in the input file
    if NUM_LINES > totalLines:
        print("WARNING: Not enough lines in the input file, so entire input file will be used")
        NUM_LINES = totalLines

    # Indices of the records to keep, in ascending order so the input can
    # be consumed in a single forward pass.
    lines = sorted(random.sample(_lazy_range(totalLines), NUM_LINES))

    if DEBUG:
        print("totalLines:  %d NUM_LINES:  %d" % (totalLines, NUM_LINES))
    if DEBUG:
        print("lines:  %s" % (lines,))

    i = 0  # index of the next record to be read from inFile
    j = 0  # physical lines consumed so far (used for debug output only)
    for lnum in lines:
        # Skip every record that precedes the next sampled one.
        while i < lnum:
            # assume lines are grouped, as in FASTQ files.
            for g in range(LINES_GROUPED):
                if DEBUG:
                    print("searching j:  %d i:  %d g:  %d" % (j, i, g))
                if not inFile.readline():
                    # Not expected, because NUM_LINES is capped at
                    # totalLines; kept as a guard against a miscount.
                    error("ERROR: Ran out of lines in the file\n")
                j += 1
            i += 1

        if DEBUG:
            print("found one %d" % i)

        # Copy the sampled record to the output file.
        for g in range(LINES_GROUPED):
            if DEBUG:
                print("found j:  %d i:  %d g:  %d" % (j, i, g))
            line = inFile.readline()
            if not line:
                error("ERROR: Ran out of lines in the file\n")
            outFile.write(line)
            j += 1
        i += 1
        numSampled += 1
finally:
    # Always release both handles, including when an exception or
    # SystemExit propagates; closing an already-closed file is harmless.
    inFile.close()
    outFile.close()

print("Total reads: %d" % totalLines)
print("Reads Sampled: %d" % numSampled)