-
Notifications
You must be signed in to change notification settings - Fork 9
/
separators-pre.js
224 lines (195 loc) · 6.86 KB
/
separators-pre.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
// To compile run:
// npm install browserify
// npm install json2csv
// browserify separators-pre.js > separators.js
// This will create a bundle that includes cvs2json.
//
const suffixToSep = new Map([
["csv", ","],
["tsv", "\t"],
["psv", "|"],
["ssv", ";"],
]);
var enc = new TextEncoder(); // always utf-8
var dec = new TextDecoder(); // always utf-8
let toHex = x => x.toString(16).padStart(2,'0')
function getPossibleSeps(seps) {
// Guess the separator as the char that occurs a consistent number of times
// in each line.
let s = [];
for (const [k, v] of seps) {
let max = Math.max(...v);
if (!max)
continue
let min = Math.min(...v);
if (min == max) {
s.push(k);
}
}
return s;
}
// Returns the guessed separator as a decimal integer, e.g. '\t' as 9.
// Returns -1 if we can't figure out the separator used in the file.
function guessSeparatorFromData(d) {
// Number of lines to use for guessing is the number of newlines, to a max of 10
let ltc = Math.min(d.filter(x => x === 0x0a).length, 10);
let seps = new Map();
suffixToSep.forEach((x,y,z) => seps.set(enc.encode(x)[0], new Array(ltc).fill(0)));
let cl = 0; // line count
let skip = false;
// Count the appearances of each separator in each line.
for (let i = 0; i < d.byteLength; i++) {
// Ignore anything inside double quotes
if (d[i] == 0x22) { skip = !skip; }
if (skip) { continue; }
// Check for newline (\n)
if (d[i] == 0x0a) {
cl++;
if (cl == ltc)
break;
continue;
}
if (seps.has(d[i])) {
let cv = seps.get(d[i]);
cv[cl]++
seps.set(d[i], cv);
}
}
let s = getPossibleSeps(seps);
if (s.length != 1)
return -1;
return s[0];
}
// Take all the sheets in a workbook and return them as an
// array of [csvdata, tablename]
function convertExcelToCSV(d, filename) {
var data = new Uint8Array(d);
var wb = XLSX.read(data,{type:'array'});
let sheets = [];
for (let i = 0, l = wb.SheetNames.length; i < l; i += 1) {
let s = wb.Sheets[wb.SheetNames[i]];
var csv = XLSX.utils.sheet_to_csv(s, { type:'array', header: 1 });
sheets.push([enc.encode(csv), filename + wb.SheetNames[i]]);
}
return sheets;
}
// Take all the sheets in a workbook and return them as an
// array of [csvdata, tablename]
function convertJSONToCSV(d) {
const { Parser } = require('json2csv');
const opts = { flatten: true, flattenSeparator: '__' };
const parser = new Parser(opts);
const csv = parser.parse(JSON.parse(dec.decode(d)));
return csv;
}
function guessSeparator(filename, data) {
let suff = filename.slice(-3);
if (suffixToSep.has(suff)) {
return [toHex(enc.encode(suffixToSep.get(suff))[0]), suffixToSep.get(suff)];
}
// Use the first 10,000 bytes for guessing.
let d = new Uint8Array(data.slice(0,10000));
let s = guessSeparatorFromData(d);
// Special case decoding tab to '\t'.
let h = toHex(s);
if (s == 9) {
s = '\t';
} else {
s = dec.decode(new Uint8Array([s]));
}
return [h, s];
}
global.getDataAndSeparator = function (d, filename) {
// Handle spreadsheets.
if ([".xls", ".xlsx", ".ods"].some(e => filename.endsWith(e))) {
let dt = convertExcelToCSV(d, filename);
let header = hasHeader(dt, ',');
return [convertExcelToCSV(d, filename), '2c', header];
}
// Handle JSON
if ([".json", ".js"].some(e => filename.endsWith(e))) {
let dt = convertJSONToCSV(d);
let header = hasHeader(dt, ',');
let r = enc.encode(dt);
return [[[r, filename]], '2c', header];
}
let [sep, sepAsText] = guessSeparator(filename, d);
let i = indexToFirstLine(d, sepAsText);
d = d.slice(i);
let header = hasHeader(d, sepAsText);
return [[[d, filename]], sep, header];
}
function indexToFirstLine(data, sep) {
let d = new Uint8Array(data.slice(0,10000));
let indexOfFirstSep = d.indexOf(enc.encode(sep)[0]);
let lineBreak = d.indexOf(0x0a);
// Line break occurs after the first instance of the separator, so there
// are no leading lines.
if (lineBreak > indexOfFirstSep) {
return 0;
}
// Search for the first line break before the first occurence
// of our separator.
let prevLineBreak = 0;
while (lineBreak < indexOfFirstSep) {
prevLineBreak = lineBreak + 1;
lineBreak = d.indexOf(0x0a, lineBreak + 1);
// Just in case there's no line break at all.
if (lineBreak < 0) {
return 0;
}
}
return prevLineBreak;
}
function hasHeader(data, s) {
let d = new Uint8Array(data.slice(0,10000));
d = new Uint8Array(d.slice(0,d.indexOf(0x0a)));
let st = dec.decode(d);
let a = st.split(s);
let u = [...new Set(a)];
// Duplicate values in the header line suggest it is not a header.
if (u.length != a.length) {
return false;
}
// More than one non-alphanumeric field suggests not a header.
var an = new RegExp("[^0-9\-\.]\+");
let nums = a.filter(x => an.test(x) == false).length;
if (nums > 1) {
return false;
}
return true;
}
function testGuessSeparatorFromData() {
let d = enc.encode("a,b,c\n1,2,3\n4,5,6");
let result = toHex(guessSeparatorFromData(d));
console.log((result == '2c' ? 'Pass. ' : 'FAIL. ') + 'Detect CSV ' + '. Actual result:', result);
d = enc.encode("a\tb\tc\n1\t2\t3\n4\t5\t6");
result = toHex(guessSeparatorFromData(d));
console.log((result == '09' ? 'Pass. ' : 'FAIL. ') + 'Detect TSV ' + '. Actual result:', result);
d = enc.encode("a|b|c\n1|2|3\n2|3|4");
result = toHex(guessSeparatorFromData(d));
console.log((result == '7c' ? 'Pass. ' : 'FAIL. ') + 'Detect pipe ' + '. Actual result:', result);
d = enc.encode("a,\"b,e\",c\n1,2,3\n4,5,6");
result = toHex(guessSeparatorFromData(d));
console.log((result == '2c' ? 'Pass. ' : 'FAIL. ') + 'Detect CSV with double quotes in header ' + '. Actual result:', result);
}
function testHeaderDetection() {
let d = enc.encode("a,b,c\n1,2,3\n4,5,6");
let result = hasHeader(d,',');
console.log((result == true ? 'Pass. ' : 'FAIL. ') + 'Detect header ' + '. Actual result:', result);
d = enc.encode("a,b,b\n1,2,3\n4,5,6");
result = hasHeader(d,',');
console.log((result == false ? 'Pass. ' : 'FAIL. ') + 'Detect duplicate items in header ' + '. Actual result:', result);
d = enc.encode("a,1,2\n1,2,3\n4,5,6");
result = hasHeader(d,',');
console.log((result == false ? 'Pass. ' : 'FAIL. ') + 'Detect numbers header ' + '. Actual result:', result);
d = enc.encode("a,b,c\n1,2,3\n4,5,6");
result = hasHeader(d,',');
console.log((result == true ? 'Pass. ' : 'FAIL. ') + 'Detect numbers header ' + '. Actual result:', result);
d = enc.encode("a,b1,c2\n1,2,3\n4,5,6");
result = hasHeader(d,',');
console.log((result == true ? 'Pass. ' : 'FAIL. ') + 'Detect numbers header ' + '. Actual result:', result);
d = enc.encode("2021-02-01,2021-02-01,c\n1,2,3\n4,5,6");
result = hasHeader(d,',');
console.log((result == fail ? 'Pass. ' : 'FAIL. ') + 'Detect numbers header ' + '. Actual result:', result);
}