forked from zygmuntz/phraug2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
csv2vw.py
142 lines (100 loc) · 3.31 KB
/
csv2vw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
'Convert CSV file to Vowpal Wabbit format.'
'Allows mixing of categorical and numerical data'
import sys
import csv
import argparse
def clean( item ):
    """Return `item` with characters that cannot appear in a feature name removed.

    All whitespace is dropped (the string is split on whitespace and the
    pieces are glued back together), then every '|' and ':' is deleted.
    """
    squeezed = "".join( item.split())
    for forbidden in ( "|", ":" ):
        squeezed = squeezed.replace( forbidden, "" )
    return squeezed
def handle_label( label ):
    """Normalise a raw label for use at the start of an output line.

    label -- the raw label, normally a string taken from a CSV cell.

    Returns:
      - "-1" for a label equal to 0 when the module-level `args.convert_zeros`
        is set, otherwise "0";
      - '1' for a label equal to 1;
      - the label converted to float for any other numeric value;
      - '' when the label cannot be converted to a number (a warning is printed).
    """
    # Only the conversion itself is guarded. The original bare `except:` wrapped
    # the whole body, so unrelated errors (and KeyboardInterrupt) were silently
    # turned into an empty label.
    try:
        value = float( label )
    except ( TypeError, ValueError ):
        # single-argument print( ... ) produces the same output on Python 2 and 3
        if label == '':
            print( "WARNING: a label is ''" )
        else:
            print( "WARNING: a label is '{}', setting to ''".format( label ))
        return ''

    if value == 0.0:
        return "-1" if args.convert_zeros else "0"
    if value == 1.0:
        return '1'
    return value
def construct_line( label, line ):
    """Build one output line from a label and the remaining cells of a CSV row.

    label -- raw label value; passed through handle_label().
    line  -- list of cell strings, with the label column already removed.

    The line starts with "<label> |n" followed by one space-separated feature
    per emitted cell and ends with a newline. Feature ids are the 1-based
    position of the cell in `line`. Columns whose 0-based position is in the
    module-level `ignore_columns_dict` are skipped.

    When the module-level `args.categorical` is set, every cell becomes a
    categorical feature "c<id>_<cleaned value>". Otherwise:
      - blank cells (empty or whitespace only) are skipped;
      - cells that parse as a float equal to zero are skipped (sparse format);
      - other cells that parse as a float become "<id>:<value>";
      - everything else becomes a categorical feature as above.
    """
    parts = [ "{} |n".format( handle_label( label )) ]

    # the rest
    for i, item in enumerate( line ):

        if i in ignore_columns_dict:
            continue

        # 1-based indexing here
        column = i + 1

        if args.categorical:
            parts.append( "c{}_{}".format( column, clean( item )))
            continue

        # Strip before emitting: float() tolerates surrounding whitespace, but
        # writing " 1.5" verbatim would yield "3: 1.5" and split the feature
        # in two. A whitespace-only cell is treated like an empty one.
        value = item.strip()
        if not value:
            continue

        try:
            is_zero = float( value ) == 0.0
        except ValueError:
            # not a number -> categorical
            parts.append( "c{}_{}".format( column, clean( item )))
            continue

        if is_zero:
            continue    # sparse format

        parts.append( "{}:{}".format( column, value ))

    return " ".join( parts ) + "\n"
# ---
# Command-line interface: two positional paths followed by optional switches.
parser = argparse.ArgumentParser(
    description = 'Convert CSV file to Vowpal Wabbit format.'
)

# positional arguments
parser.add_argument(
    "input_file",
    help = "path to csv input file"
)
parser.add_argument(
    "output_file",
    help = "path to output file"
)

# optional arguments
parser.add_argument(
    "-s", "--skip_headers",
    action = "store_true",
    help = "use this option if there are headers in the file - default false"
)
parser.add_argument(
    "-l", "--label_index",
    type = int,
    default = 0,
    help = "index of label column (default 0, use -1 if there are no labels)"
)
# store_true already defaults to False, so no explicit default is needed
parser.add_argument(
    "-z", "--convert_zeros",
    action = 'store_true',
    help = "convert labels for binary classification from 0 to -1"
)
parser.add_argument(
    "-i", "--ignore_columns",
    help = "zero-based index(es) of columns to ignore, for example 0 or 3 or 3,4,5 (no spaces in between)"
)
parser.add_argument(
    "-c", "--categorical",
    action = 'store_true',
    help = "treat all columns as categorical"
)
parser.add_argument(
    "-n", "--print_counter",
    type = int,
    default = 10000,
    help = "print counter every _ examples (default 10000)"
)

args = parser.parse_args()
###
# Parse the comma-separated --ignore_columns option into a list of ints.
ignore_columns = []
if args.ignore_columns:
    # A list comprehension instead of map(): the result is a real list on
    # Python 3 too, so the membership test below cannot exhaust an iterator.
    ignore_columns = [ int( x ) for x in args.ignore_columns.split( ',' ) ]
    # prints exactly what `print "ignoring columns", ignore_columns` printed
    print( "ignoring columns {}".format( ignore_columns ))

if args.label_index in ignore_columns:
    # call form of raise is valid on both Python 2 and Python 3
    raise ValueError( "You are not trying to ignore the label column, are you?" )

# correct for later popping the label: once the label cell is removed from a
# row, every column to its right moves one position to the left
if args.label_index >= 0:
    ignore_columns = [ x - 1 if x > args.label_index else x for x in ignore_columns ]

# a dictionary for faster 'in'
ignore_columns_dict = { x: 1 for x in ignore_columns }
###
# Convert the input row by row. The with-statement guarantees that both files
# are closed (and the output flushed) even if a row fails to convert.
# NOTE(review): writing str to a file opened with 'wb' only works on Python 2.
with open( args.input_file ) as i, open( args.output_file, 'wb' ) as o:

    reader = csv.reader( i )

    if args.skip_headers:
        # discard the header row; the default keeps an empty input file from
        # raising StopIteration
        headers = next( reader, None )

    n = 0

    for line in reader:

        if args.label_index < 0:
            # no label column in the file: use a constant label
            label = 1
        else:
            label = line.pop( args.label_index )

        new_line = construct_line( label, line )
        o.write( new_line )

        n += 1
        # a counter interval of 0 disables progress output instead of
        # raising ZeroDivisionError
        if args.print_counter and n % args.print_counter == 0:
            print( n )