diff --git a/.gitignore b/.gitignore index 3972ba99..5ffd9c3e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,5 +5,6 @@ npm-debug.log .nyc_output dist !test/test-files/*.parquet +!test/reference-test/files/*.parquet examples/server/package-lock.json -test/browser/*.js \ No newline at end of file +test/browser/*.js diff --git a/lib/codec/index.ts b/lib/codec/index.ts index af182ab1..85a1e507 100644 --- a/lib/codec/index.ts +++ b/lib/codec/index.ts @@ -1,5 +1,5 @@ export * as PLAIN from './plain' export * as RLE from './rle' export * as PLAIN_DICTIONARY from './plain_dictionary' - +export * as RLE_DICTIONARY from './plain_dictionary' diff --git a/lib/reader.ts b/lib/reader.ts index 1be3ad48..4e7d15d1 100644 --- a/lib/reader.ts +++ b/lib/reader.ts @@ -672,8 +672,9 @@ export class ParquetEnvelopeReader { num_values: metadata.num_values }); - if (metadata.dictionary_page_offset) { - const offset = +metadata.dictionary_page_offset; + // If this exists and is greater than zero then we need to have an offset + if (metadata.dictionary_page_offset && +metadata.dictionary_page_offset > 0) { + const offset: number = +metadata.dictionary_page_offset; const size = Math.min(+this.fileSize - offset, this.default_dictionary_size); await this.read(offset, size, colChunk.file_path).then(async (buffer: Buffer) => { diff --git a/lib/shred.ts b/lib/shred.ts index 518aff8b..723a32a3 100644 --- a/lib/shred.ts +++ b/lib/shred.ts @@ -227,6 +227,8 @@ function materializeRecordField(record: Record, branch: Array

x.endsWith(".parquet") && !unsupported.includes(x)); + + for (const filename of listOfFiles) { + if (onlyTest && onlyTest !== filename) continue; + it(`Reading ${filename}`, async function () { + const reader = await parquet.ParquetReader.openFile(path.join(__dirname, 'files', filename)); + const schema = reader.getSchema(); + expect(schema.fieldList).to.have.length.greaterThan(0); + const cursor = reader.getCursor(); + const record = await cursor.next() as any; + // Expect the same keys as top-level fields + const expectedRecordKeys = schema.fieldList.filter(x => x.path.length === 1).map(x => x.name); + expect(Object.keys(record)).to.deep.equal(expectedRecordKeys); + }) + } +}); diff --git a/test/shred.js b/test/shred.js index 9fed2ab5..593285c9 100644 --- a/test/shred.js +++ b/test/shred.js @@ -498,11 +498,11 @@ describe('ParquetShredder', function() { assert.deepEqual( records[2], - { name: "kiwi", price: 99.0 }); + { name: "kiwi", price: 99.0, stock: null }); assert.deepEqual( records[3], - { name: "banana", stock: [{ warehouse: "C" }], price: 42.0 }); + { name: "banana", stock: [{ quantity: null, warehouse: "C" }], price: 42.0 }); }); it('should materialize a static nested record with blank optional value', function() { @@ -549,7 +549,7 @@ describe('ParquetShredder', function() { assert.deepEqual( records[0], - { fruit: { name: "apple" } }); + { fruit: { name: "apple", colour: null } }); }); diff --git a/test/test-files.js b/test/test-files.js index baf36dbf..4654bfbc 100644 --- a/test/test-files.js +++ b/test/test-files.js @@ -109,7 +109,7 @@ describe('test-files', function() { it('test-converted-type-null.parquet loads', async function() { const data = await readData('test-converted-type-null.parquet'); - assert.deepEqual(data,[{foo: 'bar'},{}]); + assert.deepEqual(data,[{foo: 'bar'},{foo: null}]); }); it('test-enum-type.parquet loads', async function() { @@ -119,12 +119,20 @@ describe('test-files', function() { it('test-null-dictionary.parquet loads', async 
function() { const data = await readData('test-null-dictionary.parquet'); - assert.deepEqual(data,[].concat.apply([{}],[...Array(3)].map( () => ([{foo: 'bar'}, {foo: 'baz'}])))); + assert.deepEqual( + data, + [ + { foo: null }, + { foo: 'bar' }, { foo: 'baz' }, + { foo: 'bar' }, { foo: 'baz' }, + { foo: 'bar' }, { foo: 'baz' } + ] + ); }); it('test-null.parquet loads', async function() { const data = await readData('test-null.parquet'); - assert.deepEqual(data,[{foo: 1, bar: 2},{foo: 1}]); + assert.deepEqual(data,[{foo: 1, bar: 2},{foo: 1, bar: null}]); }); it('test.parquet loads', async function() { @@ -146,7 +154,7 @@ describe('test-files', function() { const scale = schema.fields["value"].scale; assert.equal(scale, 2); const divider = 10 ** scale; - + for (let i = 0; i < data.length; i++) { const valueToMatch = i + 1; // Decimal values whose primitive types are fixed length byte array will @@ -160,11 +168,11 @@ describe('test-files', function() { assert.equal(numericalValue, valueToMatch); } }); - + it('byte_array_decimal.parquet loads', async function () { const schema = await readSchema('byte_array_decimal.parquet'); const data = await readData('byte_array_decimal.parquet'); - + const scale = schema.fields["value"].scale; assert.equal(scale, 2); const divider = 10 ** scale; @@ -173,7 +181,7 @@ describe('test-files', function() { const valueToMatch = i + 1; // Decimal values whose primitive types are byte array will // be returned as raw buffer values. - // For the test data, the actual decimal values and the corresponding buffer lengths + // For the test data, the actual decimal values and the corresponding buffer lengths // are small enough so we can treat the buffer as a positive integer and compare the values. // In reality, the user will need to use a more novel approach to parse the // buffer to an object that can handle large fractional numbers. 
@@ -188,4 +196,23 @@ describe('test-files', function() { assert.equal(decimalValue, valueToMatch); } }); + + describe("RLE", function () { + // Tracked in https://github.com/LibertyDSNP/parquetjs/issues/113 + it.skip('rle_boolean_encoding.parquet loads', async function() { + const data = await readData('rle/rle_boolean_encoding.parquet'); + assert.deepEqual(data[0],{ datatype_boolean: true }); + assert.deepEqual(data[1],{ datatype_boolean: false }); + }); + + it('rle-dict-snappy-checksum.parquet loads', async function() { + const data = await readData('rle/rle-dict-snappy-checksum.parquet'); + assert.deepEqual(data[0],{ binary_field: "c95e263a-f5d4-401f-8107-5ca7146a1f98", long_field: "0" }); + }); + + it('rle-dict-uncompressed-corrupt-checksum.parquet loads', async function() { + const data = await readData('rle/rle-dict-uncompressed-corrupt-checksum.parquet'); + assert.deepEqual(data[0],{ binary_field: "6325c32b-f417-41aa-9e02-9b8601542aff", long_field: "0" }); + }); + }) }); diff --git a/test/test-files/rle/rle-dict-snappy-checksum.parquet b/test/test-files/rle/rle-dict-snappy-checksum.parquet new file mode 100644 index 00000000..4c183d89 Binary files /dev/null and b/test/test-files/rle/rle-dict-snappy-checksum.parquet differ diff --git a/test/test-files/rle/rle-dict-uncompressed-corrupt-checksum.parquet b/test/test-files/rle/rle-dict-uncompressed-corrupt-checksum.parquet new file mode 100644 index 00000000..20e23aaa Binary files /dev/null and b/test/test-files/rle/rle-dict-uncompressed-corrupt-checksum.parquet differ diff --git a/test/test-files/rle/rle_boolean_encoding.parquet b/test/test-files/rle/rle_boolean_encoding.parquet new file mode 100644 index 00000000..6a6de0a9 Binary files /dev/null and b/test/test-files/rle/rle_boolean_encoding.parquet differ