#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
robust_csv_parser.py
Python offers many functions to load a text file containing numbers, e.g. numpy.genfromtxt(), csv.reader() and pandas.read_table().
The reason why I spent two days writing a robust alternative is that the mentioned functions are not flexible enough to recognize
automatically the syntax of all data files which I encounter in my daily practice.
This module implements following improvements:
1) Automatic detection of column separators. This is also implemented by pandas.read_table() as option sep=None), but this currently
clashes with the options delim_whitespace=True, error_bad_lines=False. This module employs quite a robust detection.
2) It allows different comment characters like "!;,%" in a single file. Useful, but probably not implemented elsewhere.
3) It (usually) aborts reading binary files rather soon. In pandas, it can take a lot of time when one accidentally feeds it
with a big binary file.
4) It recognizes parameters in the file header, and returns them as a dict.
5) If not specified otherwise, it does not clog the stderr with error reports .
"""
## Import common modules
import numpy as np
import re
import warnings
#warnings.simplefilter("ignore")    ## uncomment to suppress all warnings globally
verbose = 1          # set to 0 or False to silence the progress reports
very_verbose = False
commentCharsLineStart = ['#', '!', ';', ',', '%'] # a line starting with one of these characters is a comment (or a header)
parameterSeparators = ['=', '\t', ':', ','] # if a line is a comment AND contains one of these, it is parsed as a parameter of the file
commentCharsLineMiddle = ['#'] # everything after this character is omitted
parameterSplitterOtherwise = ['='] # if a line contains this character, it is parsed as a parameter of the file
strict_table_layout = False # when True, doubled column separators will always imply that the field was left empty
maxLineLength = 10000 # hardly any numeric table in ASCII has more than 10 kB per line
allowCommaDecimalSep = True # note: causes wrong detection in true CSV (where ',' is the column separator)
headerOrdinateAllowOmit = True # e.g. three-column CSV files sometimes have only two names in the header
headerOrdinateSuggestName = 'x' # ... in such a case, this name will be prepended as the first column name
tryColSeparators = [';', ',', '\t', r'\s', r'\s+'] # possible ways of separating columns
## TODO test avoiding escaped whitespace, e.g. use r"[^\\]\s" instead of r"\s"
unevenColumnLengthPenalty = 1 # any positive number; a low value may lead to joined cells, a high value to spuriously empty cells
guessHeaderSpaces = True # try to expand CamelCase names and unit brackets with spaces, for better-looking plot labels
def safe_float(string):
    try: return float(string)
    except (ValueError, TypeError): return np.nan
def can_float(string):
    try: float(string); return True    ## note: "nan" counts as a number, too
    except (ValueError, TypeError): return False
def filter_floats(arr): return [safe_float(field) for field in arr if can_float(field)]
def filter_non_floats(arr): return [field for field in arr if (not can_float(field) and field != "")]
def floatableLen(arr): return len(filter_floats(arr))
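## (Illustrative examples of the helpers above:
##    filter_floats(['1.5', 'foo', '2']) == [1.5, 2.0]
##    filter_non_floats(['1.5', 'foo', '']) == ['foo']
##    floatableLen(['1.5', 'foo']) == 1 )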
def loadtxt(file_name, sizehint=None, encoding=None):
    ## Load the file, trying several encodings unless one is specified
    encodings = ['utf-8', 'latin-1'] if not encoding else [encoding]
    for _encoding in encodings:
        try:
            with open(file_name, encoding=_encoding) as infile:
                lines = infile.readlines(sizehint)
            if verbose: print("Read %d lines" % len(lines))
            break
        except UnicodeDecodeError:
            pass
    else:
        raise IOError('file %s could not be decoded with any of the encodings: %s' % (file_name, encodings))
    ## Abort if an overly long line is found
    if len(lines) > 0 and max(len(line) for line in lines) > maxLineLength:
        raise IOError('Error: a line longer than %d characters was found, the file is probably binary or corrupt' % maxLineLength)
    ## Filter out empty lines, and also all that are commented out. If they have a parameter-like syntax, store them in a dict.
    filteredLines = []
    parameters = {}
    skippedLinesN = 0
    firstNonSkippedLine = None
    for lineNumber, line in enumerate(lines):
        if line[0:1] in commentCharsLineStart:
            for parameterSeparator in parameterSeparators:
                regExpPar = re.compile(parameterSeparator)
                if parameterSeparator in line[1:-1]:
                    paramKey, paramValue = regExpPar.split(line[1:-1], 1)
                    parameters[paramKey.strip()] = float(paramValue) if (safe_float(paramValue) is not np.nan) else paramValue.strip()
                    break
            skippedLinesN += 1
        else:
            if firstNonSkippedLine is None: firstNonSkippedLine = lineNumber
            filteredLines.append(line)
    if very_verbose: print("firstNonSkippedLine", firstNonSkippedLine)
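    ## (illustrative: a line "# temperature = 300" ends up as parameters == {'temperature': 300.0},
    ##  whereas "# some remark" is silently skipped)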
    ## Cut each line short once an in-line comment character is found
    for commentCharLineMiddle in commentCharsLineMiddle:
        filteredLines = [line.split(commentCharLineMiddle, 1)[0] for line in filteredLines]
    ## Remove empty lines (which could otherwise confuse the column-number estimator)
    filteredLines = [line for line in filteredLines if line.strip() != ""]
    if filteredLines == []: raise RuntimeError("Error: all lines in the file are empty or identified as comments")
    if very_verbose: print("filteredLines:", filteredLines)
    ## Handle files that use ',' instead of '.' as the decimal separator (TODO a different strategy may be needed)
    colSeparators = list(tryColSeparators)
    if allowCommaDecimalSep:
        allChars = ''.join(filteredLines)
        countComma, countDot = len(re.findall(r',[\d\s]', allChars)), len(re.findall(r'\.[\d\s]', allChars))
        if very_verbose: print('countComma, countDot', countComma, countDot)
        if countComma > countDot and countDot < len(filteredLines):
            if verbose: print('detected that the comma is used more often than the dot; accepting it as the decimal separator', countComma, countDot)
            filteredLines = [re.sub(r',([\d\s])', r'.\1', fl) for fl in filteredLines]
            colSeparators.remove(',')    ## a comma can then no longer separate columns
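    ## (illustrative: with the comma detected as the decimal separator, a line "3,1415 2,7182"
    ##  is rewritten to "3.1415 2.7182" before the column-separator detection below)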
    ## Find the number of columns: try all candidate separators and score each by a fitness value
    resultingColSeparatorFitness = None
    for tryColSeparator in colSeparators:
        ## Split each line by tryColSeparator; count the data fields that can be converted to float
        ## TODO the first column may contain text labels - do not enforce floatability!
        regExpSep = re.compile(tryColSeparator)
        ## For tables of a very strict layout, a doubled separator or a non-numeric field represents a NaN value
        chosen_len_fn = len if strict_table_layout else floatableLen
        ## (In the following, the inner comprehension returns the list of fields on each line; the outer comprehension assigns
        ## each line the count of numbers it appears to contain. This naturally depends on our choice of the field separator.)
        columnsOnLines = np.array([chosen_len_fn(splitLine) for splitLine in [regExpSep.split(line.strip()) for line in filteredLines]])
        ## If the first line is a header, do not let it spoil the column-number statistics
        if len(columnsOnLines) > 1 and columnsOnLines[0] == 0: columnsOnLines = columnsOnLines[1:]
        ## Compute the statistics of the numeric-valued fields on each line
        columnsOnLinesAvg = np.sum(columnsOnLines) / len(columnsOnLines)
        columnsOnLinesSD = np.sum(columnsOnLines**2) / len(columnsOnLines) - columnsOnLinesAvg**2
        tryColSeparatorFitness = columnsOnLinesAvg - unevenColumnLengthPenalty * columnsOnLinesSD**.5
        ## Choose the separator that gives the highest average and, simultaneously, the lowest deviation of the field count per line.
        ## Note that the use of < instead of <= is important to prefer the less greedy regexp (e.g. r"\s" over r"\s+") and
        ## to preserve the column order even if the table is "hollow", i.e. cells are missing in its middle columns.
        if resultingColSeparatorFitness is None or resultingColSeparatorFitness < tryColSeparatorFitness:
            resultingColSeparator = tryColSeparator
            resultingColSeparatorFitness = tryColSeparatorFitness
            resultingColumnsOnLines = int(columnsOnLinesAvg + 0.99)    ## (rounding up favors keeping more data whenever possible)
        if very_verbose:
            print("tryColSeparator, columnsOnLines, columnsOnLinesAvg, tryColSeparatorFitness, resultingColumnsOnLines:",
                    tryColSeparator, columnsOnLines, columnsOnLinesAvg, tryColSeparatorFitness, resultingColumnsOnLines)
    if resultingColumnsOnLines == 0: raise RuntimeError("Error: estimated that there are zero data columns")
    if very_verbose: print("resultingColSeparator, resultingColumnsOnLines", resultingColSeparator, resultingColumnsOnLines)
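    ## (worked example: for the two lines "1;2" and "3;4", the separator ';' yields the per-line float
    ##  counts [2, 2], i.e. an average of 2 with zero deviation and thus a fitness of 2, whereas ','
    ##  yields [0, 0] and a fitness of 0; hence ';' is selected)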
    ## Detect the header in the last commented line before the data start; if the file contains no comments, try its very first line
    maybeHeaderLine = lines[firstNonSkippedLine-1 if firstNonSkippedLine > 0 else 0]
    while maybeHeaderLine[:1] in commentCharsLineStart: maybeHeaderLine = maybeHeaderLine[1:]    # strip the commenting-out characters
    regExpSep = re.compile(resultingColSeparator)
    columnsInHeader = filter_non_floats(regExpSep.split(maybeHeaderLine.strip()))
    ## If the header consists of number-like fields only, do not consider it a header (no matter whether it is commented out or not)
    if very_verbose:
        print("columnsInHeader, len(columnsInHeader), floatableLen(columnsInHeader):",
                columnsInHeader, len(columnsInHeader), floatableLen(columnsInHeader))
    if floatableLen(columnsInHeader) == len(columnsInHeader):
        columnsInHeader = []
    if very_verbose: print("columnsInHeader after search", columnsInHeader)
    ## Match the header length to the detected column number
    if len(columnsInHeader) != resultingColumnsOnLines:
        if verbose:
            warnings.warn(("Warning: found %d labels in the header for a total of %d columns; truncating or extending by the " +
                    "rule 'x, column1, column2...'") % (len(columnsInHeader), resultingColumnsOnLines), RuntimeWarning)
        if len(columnsInHeader) == 0 and resultingColumnsOnLines == 1: columnsInHeader = ['values']
        if headerOrdinateAllowOmit and len(columnsInHeader) == resultingColumnsOnLines - 1:
            columnsInHeader = [headerOrdinateSuggestName] + columnsInHeader    ## supply the missing name of the first column
        while len(columnsInHeader) < resultingColumnsOnLines:    ## extend the header if shorter
            columnsInHeader.append("column%d" % len(columnsInHeader))
        columnsInHeader = columnsInHeader[:resultingColumnsOnLines]    ## truncate it if longer
    if very_verbose: print("columnsInHeader after the label adjustment to the columns", columnsInHeader)
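    ## (illustrative: for a three-column file whose header reads just "amplitude phase", the suggested
    ##  ordinate name is prepended, which gives ['x', 'amplitude', 'phase'])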
    ## Format the header fields: convert e.g. "sampleTemperature(K)" to "sample Temperature (K)"
    expandedColumnsInHeader = []
    def camel_case_split(identifier):    # adapted from http://stackoverflow.com/a/29920015/1615108
        matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
        #return [(m.group(0).lower() if not m.group(0).isupper() else m.group(0)) for m in matches]    ## keep acronyms upper, convert other Camels to lower
        return [m.group(0) for m in matches]
    if guessHeaderSpaces:
        for column in columnsInHeader:
            if "(" in column[:-2] and ")" in column[-1:]:       ## a name ending with a unit in parentheses
                lcol, rcol = column.rsplit('(', 1)
                column = " (".join([" ".join(camel_case_split(lcol.replace('_', ' '))), rcol])
            elif "[" in column[:-2] and "]" in column[-1:]:     ## a name ending with a unit in brackets
                lcol, rcol = column.rsplit('[', 1)
                column = " [".join([" ".join(camel_case_split(lcol.replace('_', ' '))), rcol])
            else:
                column = " ".join(camel_case_split(column))
            expandedColumnsInHeader.append(column)
    else:
        expandedColumnsInHeader = columnsInHeader    ## keep the labels as they are
    if very_verbose: print("expandedColumnsInHeader", expandedColumnsInHeader)
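    ## (illustrative: camel_case_split('sampleTemperature') == ['sample', 'Temperature'], so the label
    ##  becomes 'sample Temperature'; acronyms such as 'FFT' are kept in one piece)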
    if very_verbose: print('firstNonSkippedLine is #%d and contains: "%s"' % (firstNonSkippedLine, maybeHeaderLine.strip()))
    ## Parse the actual numeric data
    table_values = []
    for line in filteredLines:
        line_values = filter_floats(regExpSep.split(line.strip()))    ## (filter_floats already converts the fields to float)
        while len(line_values) < resultingColumnsOnLines: line_values.append(np.nan)    ## extend the line if shorter
        line_values = line_values[:resultingColumnsOnLines]    ## truncate the line if longer
        if line_values[0] is not np.nan:    ## TODO fix - this only prevents an uncommented header from adding an all-NaN first line
            table_values.append(line_values)
    data_array = np.array(table_values)
    assert len(data_array) >= 1
    assert len(expandedColumnsInHeader) == len(data_array[0])
    if very_verbose:
        print("data_array, expandedColumnsInHeader, parameters:", data_array, expandedColumnsInHeader, parameters)
    return data_array, expandedColumnsInHeader, parameters
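
## A minimal, self-contained usage sketch: it writes a small space-separated sample file through the
## standard tempfile module and parses it back. (The _demo function and its sample content are only an
## illustration of the API above, not a part of the parser itself.)
def _demo():
    import os, tempfile
    sample = ("# temperature = 300\n"          ## stored as a parameter: {'temperature': 300.0}
              "# x sampleTemperature(K)\n"     ## detected as the column header
              "1.0 300.5\n"
              "2.0 301.0\n")
    with tempfile.NamedTemporaryFile('w', suffix='.dat', delete=False) as tmpfile:
        tmpfile.write(sample)
        path = tmpfile.name
    try:
        data, header, params = loadtxt(path)
        print("demo data:\n", data)            ## a 2x2 array: [[1.0, 300.5], [2.0, 301.0]]
        print("demo header:", header)          ## ['x', 'sample Temperature (K)']
        print("demo parameters:", params)      ## {'temperature': 300.0}
    finally:
        os.remove(path)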
if __name__ == "__main__":
    import sys
    fileName = sys.argv[1]
    data_array, header, parameters = loadtxt(fileName)
    print("DATA:", data_array)
    print("HEADER:", header)
    print("PARAMETERS:", parameters)