From a3ef0ce4b37955f1952dea6560c4354df7623a31 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Fri, 26 May 2023 02:06:27 +0200 Subject: [PATCH 01/37] GH-15060: [JS] Add LargeUtf8 type --- js/src/Arrow.dom.ts | 4 +-- js/src/Arrow.ts | 3 +- js/src/builder.ts | 9 +++--- js/src/builder/buffer.ts | 5 +-- js/src/builder/largeutf8.ts | 44 +++++++++++++++++++++++++++ js/src/builder/list.ts | 4 +-- js/src/data.ts | 14 +++++++-- js/src/enum.ts | 1 + js/src/interfaces.ts | 10 ++++-- js/src/type.ts | 26 ++++++++++++++-- js/src/visitor/get.ts | 4 +-- js/src/visitor/jsonvectorassembler.ts | 5 ++- js/src/visitor/set.ts | 12 +++++--- js/src/visitor/typeassembler.ts | 5 +++ js/src/visitor/typecomparator.ts | 4 ++- js/src/visitor/typector.ts | 1 + 16 files changed, 124 insertions(+), 27 deletions(-) create mode 100644 js/src/builder/largeutf8.ts diff --git a/js/src/Arrow.dom.ts b/js/src/Arrow.dom.ts index 2fdef60c1fb55..b0423d2e9adeb 100644 --- a/js/src/Arrow.dom.ts +++ b/js/src/Arrow.dom.ts @@ -47,7 +47,7 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, + Utf8, LargeUtf8, Binary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -94,5 +94,5 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder, TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder, UnionBuilder, DenseUnionBuilder, SparseUnionBuilder, - Utf8Builder, + Utf8Builder, LargeUtf8Builder } from './Arrow.js'; diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index 4a6394c266b1b..9d56a2847ba62 100644 --- a/js/src/Arrow.ts +++ b/js/src/Arrow.ts @@ -36,7 +36,7 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, + Utf8, LargeUtf8, Binary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -76,6 +76,7 @@ export { TimeBuilder, TimeSecondBuilder, 
TimeMillisecondBuilder, TimeMicrosecond export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder } from './builder/timestamp.js'; export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './builder/interval.js'; export { Utf8Builder } from './builder/utf8.js'; +export { LargeUtf8Builder } from './builder/largeutf8.js'; export { BinaryBuilder } from './builder/binary.js'; export { ListBuilder } from './builder/list.js'; export { FixedSizeListBuilder } from './builder/fixedsizelist.js'; diff --git a/js/src/builder.ts b/js/src/builder.ts index 90fe3ddcc9477..a2f32c712ea42 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -22,7 +22,7 @@ import { DataType, strideForType, Float, Int, Decimal, FixedSizeBinary, Date_, Time, Timestamp, Interval, - Utf8, Binary, List, Map_, + Utf8, LargeUtf8, Binary, List, Map_, } from './type.js'; import { createIsValidFunction } from './builder/valid.js'; import { BufferBuilder, BitmapBufferBuilder, DataBufferBuilder, OffsetsBufferBuilder } from './builder/buffer.js'; @@ -163,6 +163,7 @@ export abstract class Builder { public toVector() { return new Vector([this.flush()]); } public get ArrayType() { return this.type.ArrayType; } + public get OffsetType() { return this.type.OffsetType; } public get nullCount() { return this._nulls.numInvalid; } public get numChildren() { return this.children.length; } @@ -355,13 +356,13 @@ export abstract class FixedWidthBuilder extends Builder { +export abstract class VariableWidthBuilder extends Builder { protected _pendingLength = 0; - protected _offsets: OffsetsBufferBuilder; + protected _offsets: OffsetsBufferBuilder; protected _pending: Map | undefined; constructor(opts: BuilderOptions) { super(opts); - this._offsets = new OffsetsBufferBuilder(); + this._offsets = new OffsetsBufferBuilder(opts.type); } public setValue(index: number, value: T['TValue']) { const pending = this._pending || 
(this._pending = new Map()); diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index 03d4f33349a7a..c9ffc255cca7d 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -20,6 +20,7 @@ import { TypedArray, TypedArrayConstructor, BigIntArray, BigIntArrayConstructor } from '../interfaces.js'; +import { DataType } from '../type.js'; /** @ignore */ type DataValue = T extends TypedArray ? number : T extends BigIntArray ? WideValue : T; /** @ignore */ type WideValue = T extends BigIntArray ? bigint | Int32Array | Uint32Array : never; @@ -134,8 +135,8 @@ export class BitmapBufferBuilder extends DataBufferBuilder { } /** @ignore */ -export class OffsetsBufferBuilder extends DataBufferBuilder { - constructor(data = new Int32Array(1)) { super(data, 1); } +export class OffsetsBufferBuilder extends DataBufferBuilder { + constructor(type: T) { super(new type.OffsetType(1), 1); } public append(value: number) { return this.set(this.length - 1, value); } diff --git a/js/src/builder/largeutf8.ts b/js/src/builder/largeutf8.ts new file mode 100644 index 0000000000000..002bb8265a73d --- /dev/null +++ b/js/src/builder/largeutf8.ts @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import { LargeUtf8 } from '../type.js'; +import { encodeUtf8 } from '../util/utf8.js'; +import { BinaryBuilder } from './binary.js'; +import { BufferBuilder } from './buffer.js'; +import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; + +/** @ignore */ +export class LargeUtf8Builder extends VariableWidthBuilder { + constructor(opts: BuilderOptions) { + super(opts); + this._values = new BufferBuilder(new Uint8Array(0)); + } + public get byteLength(): number { + let size = this._pendingLength + (this.length * 4); + this._offsets && (size += this._offsets.byteLength); + this._values && (size += this._values.byteLength); + this._nulls && (size += this._nulls.byteLength); + return size; + } + public setValue(index: number, value: string) { + return super.setValue(index, encodeUtf8(value) as any); + } + // @ts-ignore + protected _flushPending(pending: Map, pendingLength: number): void { } +} + +(LargeUtf8Builder.prototype as any)._flushPending = (BinaryBuilder.prototype as any)._flushPending; diff --git a/js/src/builder/list.ts b/js/src/builder/list.ts index d83cac8e7b1c6..b2739cd5a3260 100644 --- a/js/src/builder/list.ts +++ b/js/src/builder/list.ts @@ -22,10 +22,10 @@ import { Builder, BuilderOptions, VariableWidthBuilder } from '../builder.js'; /** @ignore */ export class ListBuilder extends VariableWidthBuilder, TNull> { - protected _offsets: OffsetsBufferBuilder; + protected _offsets: OffsetsBufferBuilder>; constructor(opts: BuilderOptions, TNull>) { super(opts); - this._offsets = new OffsetsBufferBuilder(); + this._offsets = new OffsetsBufferBuilder(opts.type); } public addChild(child: Builder, name = '0') { if (this.numChildren > 0) { diff --git a/js/src/data.ts b/js/src/data.ts index dc423cdb01e1c..69fcf17d13212 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -17,7 +17,7 @@ import { Vector } from './vector.js'; import { BufferType, Type, UnionMode } from './enum.js'; -import { DataType, strideForType } from './type.js'; +import { DataType, 
LargeUtf8, strideForType } from './type.js'; import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; // When slicing, we do not know the null count of the sliced range without @@ -34,7 +34,7 @@ import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; /** @ignore */ export interface Buffers { - [BufferType.OFFSET]: Int32Array; + [BufferType.OFFSET]: T['TOffset']; [BufferType.DATA]: T['TArray']; [BufferType.VALIDITY]: Uint8Array; [BufferType.TYPE]: T['TArray']; @@ -306,6 +306,14 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitLargeUtf8(props: LargeUtf8DataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const data = toUint8Array(props['data']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toInt32Array(props['valueOffsets']); + const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? 
-1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); + } public visitBinary(props: BinaryDataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); @@ -427,6 +435,7 @@ interface IntervalDataProps extends DataProps_ { data?: D interface FixedSizeBinaryDataProps extends DataProps_ { data?: DataBuffer } interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface LargeUtf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } interface ListDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } interface FixedSizeListDataProps extends DataProps_ { child: Data } interface StructDataProps extends DataProps_ { children: Data[] } @@ -449,6 +458,7 @@ export type DataProps = ( T extends FixedSizeBinary /* */ ? FixedSizeBinaryDataProps : T extends Binary /* */ ? BinaryDataProps : T extends Utf8 /* */ ? Utf8DataProps : + T extends LargeUtf8 /* */ ? LargeUtf8DataProps : T extends List /* */ ? ListDataProps : T extends FixedSizeList /* */ ? FixedSizeListDataProps : T extends Struct /* */ ? 
StructDataProps : diff --git a/js/src/enum.ts b/js/src/enum.ts index f5856bc06afbe..2714b66739d56 100644 --- a/js/src/enum.ts +++ b/js/src/enum.ts @@ -201,6 +201,7 @@ export enum Type { SparseUnion = -24, IntervalDayTime = -25, IntervalYearMonth = -26, + LargeUtf8 = -27, } export enum BufferType { diff --git a/js/src/interfaces.ts b/js/src/interfaces.ts index 8d61295919046..6ba7912342426 100644 --- a/js/src/interfaces.ts +++ b/js/src/interfaces.ts @@ -32,6 +32,7 @@ import type { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicros import type { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, TimestampMicrosecondBuilder, TimestampNanosecondBuilder } from './builder/timestamp.js'; import type { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder } from './builder/interval.js'; import type { Utf8Builder } from './builder/utf8.js'; +import type { LargeUtf8Builder } from './builder/largeutf8.js'; import type { BinaryBuilder } from './builder/binary.js'; import type { ListBuilder } from './builder/list.js'; import type { FixedSizeListBuilder } from './builder/fixedsizelist.js'; @@ -104,7 +105,7 @@ export type BuilderCtorArgs< TArgs extends any[] = any[], TCtor extends new (type: R, ...args: TArgs) => T = new (type: R, ...args: TArgs) => T - > = TCtor extends new (type: R, ...args: infer TArgs) => T ? TArgs : never; +> = TCtor extends new (type: R, ...args: infer TArgs) => T ? TArgs : never; /** * Obtain the constructor function of an instance type @@ -114,7 +115,7 @@ export type ConstructorType< T, TCtor extends new (...args: any[]) => T = new (...args: any[]) => T - > = TCtor extends new (...args: any[]) => T ? TCtor : never; +> = TCtor extends new (...args: any[]) => T ? 
TCtor : never; /** @ignore */ export type BuilderCtorType< @@ -122,7 +123,7 @@ export type BuilderCtorType< R extends DataType = any, TCtor extends new (options: BuilderOptions) => T = new (options: BuilderOptions) => T - > = TCtor extends new (options: BuilderOptions) => T ? TCtor : never; +> = TCtor extends new (options: BuilderOptions) => T ? TCtor : never; /** @ignore */ export type BuilderType = @@ -200,6 +201,7 @@ export type TypeToDataType = { [Type.Float64]: type.Float64; [Type.Float]: type.Float; [Type.Utf8]: type.Utf8; + [Type.LargeUtf8]: type.LargeUtf8; [Type.Binary]: type.Binary; [Type.FixedSizeBinary]: type.FixedSizeBinary; [Type.Date]: type.Date_; @@ -248,6 +250,7 @@ type TypeToBuilder = { [Type.Float64]: Float64Builder; [Type.Float]: FloatBuilder; [Type.Utf8]: Utf8Builder; + [Type.LargeUtf8]: LargeUtf8Builder; [Type.Binary]: BinaryBuilder; [Type.FixedSizeBinary]: FixedSizeBinaryBuilder; [Type.Date]: DateBuilder; @@ -296,6 +299,7 @@ type DataTypeToBuilder = { [Type.Float64]: T extends type.Float64 ? Float64Builder : never; [Type.Float]: T extends type.Float ? FloatBuilder : never; [Type.Utf8]: T extends type.Utf8 ? Utf8Builder : never; + [Type.LargeUtf8]: T extends type.LargeUtf8 ? LargeUtf8Builder : never; [Type.Binary]: T extends type.Binary ? BinaryBuilder : never; [Type.FixedSizeBinary]: T extends type.FixedSizeBinary ? FixedSizeBinaryBuilder : never; [Type.Date]: T extends type.Date_ ? 
DateBuilder : never; diff --git a/js/src/type.ts b/js/src/type.ts index 1dc90c47cbd10..b88399d4253bb 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -19,7 +19,7 @@ import { Field } from './schema.js'; import { Vector } from './vector.js'; import { MapRow } from './row/map.js'; import { StructRow, StructRowProxy } from './row/struct.js'; -import { TypedArrayConstructor } from './interfaces.js'; +import { BigIntArrayConstructor, TypedArrayConstructor } from './interfaces.js'; import { bigIntToNumber } from './util/bigint.js'; import { @@ -38,9 +38,11 @@ export type IsSigned = { 'true': true; 'false': false }; export interface DataType { readonly TType: TType; readonly TArray: any; + readonly TOffset: any; readonly TValue: any; readonly TChildren: TChildren; readonly ArrayType: any; + readonly OffsetType: TypedArrayConstructor | BigIntArrayConstructor; readonly children: Field[]; } @@ -57,6 +59,7 @@ export abstract class DataType { (proto).children = null; (proto).ArrayType = Array; + (proto).OffsetType = Array; return proto[Symbol.toStringTag] = 'DataType'; })(DataType.prototype); } @@ -246,7 +250,7 @@ export class Binary extends DataType { } /** @ignore */ -export interface Utf8 extends DataType { TArray: Uint8Array; TValue: string; ArrayType: TypedArrayConstructor } +export interface Utf8 extends DataType { TArray: Uint8Array; TOffset: Uint32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetType: TypedArrayConstructor } /** @ignore */ export class Utf8 extends DataType { constructor() { @@ -256,10 +260,27 @@ export class Utf8 extends DataType { public toString() { return `Utf8`; } protected static [Symbol.toStringTag] = ((proto: Utf8) => { (proto).ArrayType = Uint8Array; + (proto).OffsetType = Uint32Array; return proto[Symbol.toStringTag] = 'Utf8'; })(Utf8.prototype); } +/** @ignore */ +export interface LargeUtf8 extends DataType { TArray: Uint8Array; TOffset: BigUint64Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetType: 
BigIntArrayConstructor } +/** @ignore */ +export class LargeUtf8 extends DataType { + constructor() { + super(); + } + public get typeId() { return Type.LargeUtf8 as Type.LargeUtf8; } + public toString() { return `LargeUtf8`; } + protected static [Symbol.toStringTag] = ((proto: LargeUtf8) => { + (proto).ArrayType = Uint8Array; + (proto).OffsetType = BigUint64Array; + return proto[Symbol.toStringTag] = 'LargeUtf8'; + })(LargeUtf8.prototype); +} + /** @ignore */ export interface Bool extends DataType { TArray: Uint8Array; TValue: boolean; ArrayType: TypedArrayConstructor } /** @ignore */ @@ -547,6 +568,7 @@ export class FixedSizeBinary extends DataType { protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => { (proto).byteWidth = null; (proto).ArrayType = Uint8Array; + (proto).OffsetType = Uint32Array; return proto[Symbol.toStringTag] = 'FixedSizeBinary'; })(FixedSizeBinary.prototype); } diff --git a/js/src/visitor/get.ts b/js/src/visitor/get.ts index 12f8325470bac..84e97180016f3 100644 --- a/js/src/visitor/get.ts +++ b/js/src/visitor/get.ts @@ -108,13 +108,13 @@ function wrapGet(fn: (data: Data, _1: any) => any) { /** @ignore */ const getNull = (_data: Data, _index: number): T['TValue'] => null; /** @ignore */ -const getVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array, index: number) => { +const getVariableWidthBytes = (values: Uint8Array, valueOffsets: Uint32Array | BigUint64Array, index: number) => { if (index + 1 >= valueOffsets.length) { return null as any; } const x = valueOffsets[index]; const y = valueOffsets[index + 1]; - return values.subarray(x, y); + return values.subarray(Number(x), Number(y)); }; /** @ignore */ diff --git a/js/src/visitor/jsonvectorassembler.ts b/js/src/visitor/jsonvectorassembler.ts index 7a617f4afe2c4..5c4897071abd2 100644 --- a/js/src/visitor/jsonvectorassembler.ts +++ b/js/src/visitor/jsonvectorassembler.ts @@ -27,7 +27,7 @@ import { BitIterator, getBit, getBool } from '../util/bit.js'; import { 
DataType, Float, Int, Date_, Interval, Time, Timestamp, Union, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, + Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, LargeUtf8, } from '../type.js'; /** @ignore */ @@ -99,6 +99,9 @@ export class JSONVectorAssembler extends Visitor { public visitUtf8(data: Data) { return { 'DATA': [...new Vector([data])], 'OFFSET': [...data.valueOffsets] }; } + public visitLargeUtf8(data: Data) { + return { 'DATA': [...new Vector([data])], 'OFFSET': [...data.valueOffsets] }; + } public visitBinary(data: Data) { return { 'DATA': [...binaryToString(new Vector([data]))], OFFSET: [...data.valueOffsets] }; } diff --git a/js/src/visitor/set.ts b/js/src/visitor/set.ts index c2d4319911afe..e356d524fa085 100644 --- a/js/src/visitor/set.ts +++ b/js/src/visitor/set.ts @@ -25,7 +25,7 @@ import { float64ToUint16 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -57,6 +57,7 @@ export interface SetVisitor extends Visitor { visitFloat32(data: Data, index: number, value: T['TValue']): void; visitFloat64(data: Data, index: number, value: T['TValue']): void; visitUtf8(data: Data, index: number, value: T['TValue']): void; + visitLargeUtf8(data: Data, index: number, value: T['TValue']): void; visitBinary(data: Data, index: number, value: T['TValue']): void; visitFixedSizeBinary(data: Data, index: number, value: T['TValue']): void; visitDate(data: Data, index: number, value: T['TValue']): void; @@ -117,10 +118,10 @@ export const 
setEpochMsToNanosecondsLong = (data: Int32Array, index: number, epo }; /** @ignore */ -export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array, index: number, value: Uint8Array) => { +export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: Uint32Array | BigUint64Array, index: number, value: Uint8Array) => { if (index + 1 < valueOffsets.length) { - const { [index]: x, [index + 1]: y } = valueOffsets; - values.set(value.subarray(0, y - x), x); + const { [index]: x, [index + 1]: y } = valueOffsets as BigUint64Array; + values.set(value.subarray(0, Number(y - x)), Number(x)); } }; @@ -158,7 +159,7 @@ export const setFixedSizeBinary = ({ stride, values } /** @ignore */ const setBinary = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, value); /** @ignore */ -const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { +const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); }; @@ -339,6 +340,7 @@ SetVisitor.prototype.visitFloat16 = wrapSet(setFloat16); SetVisitor.prototype.visitFloat32 = wrapSet(setFloat); SetVisitor.prototype.visitFloat64 = wrapSet(setFloat); SetVisitor.prototype.visitUtf8 = wrapSet(setUtf8); +SetVisitor.prototype.visitLargeUtf8 = wrapSet(setUtf8); SetVisitor.prototype.visitBinary = wrapSet(setBinary); SetVisitor.prototype.visitFixedSizeBinary = wrapSet(setFixedSizeBinary); SetVisitor.prototype.visitDate = wrapSet(setDate); diff --git a/js/src/visitor/typeassembler.ts b/js/src/visitor/typeassembler.ts index c84e3930f64f5..7594bc3a4035f 100644 --- a/js/src/visitor/typeassembler.ts +++ b/js/src/visitor/typeassembler.ts @@ -27,6 +27,7 @@ import { FloatingPoint } from '../fb/floating-point.js'; import { Binary } from '../fb/binary.js'; import { Bool } from '../fb/bool.js'; import { Utf8 } from '../fb/utf8.js'; 
+import { LargeUtf8 } from '../fb/large-utf8.js'; import { Decimal } from '../fb/decimal.js'; import { Date } from '../fb/date.js'; import { Time } from '../fb/time.js'; @@ -77,6 +78,10 @@ export class TypeAssembler extends Visitor { Utf8.startUtf8(b); return Utf8.endUtf8(b); } + public visitLargeUtf8(_node: T, b: Builder) { + LargeUtf8.startLargeUtf8(b); + return LargeUtf8.endLargeUtf8(b); + } public visitDecimal(node: T, b: Builder) { Decimal.startDecimal(b); Decimal.addScale(b, node.scale); diff --git a/js/src/visitor/typecomparator.ts b/js/src/visitor/typecomparator.ts index a77c4020961ce..777beca80f296 100644 --- a/js/src/visitor/typecomparator.ts +++ b/js/src/visitor/typecomparator.ts @@ -21,7 +21,7 @@ import { Visitor } from '../visitor.js'; import { Schema, Field } from '../schema.js'; import { DataType, TypeMap, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -52,6 +52,7 @@ export interface TypeComparator extends Visitor { visitFloat32(type: T, other?: DataType | null): other is T; visitFloat64(type: T, other?: DataType | null): other is T; visitUtf8(type: T, other?: DataType | null): other is T; + visitLargeUtf8(type: T, other?: DataType | null): other is T; visitBinary(type: T, other?: DataType | null): other is T; visitFixedSizeBinary(type: T, other?: DataType | null): other is T; visitDate(type: T, other?: DataType | null): other is T; @@ -236,6 +237,7 @@ TypeComparator.prototype.visitFloat16 = compareFloat; TypeComparator.prototype.visitFloat32 = compareFloat; TypeComparator.prototype.visitFloat64 = compareFloat; TypeComparator.prototype.visitUtf8 = compareAny; +TypeComparator.prototype.visitLargeUtf8 = compareAny; TypeComparator.prototype.visitBinary = compareAny; 
TypeComparator.prototype.visitFixedSizeBinary = compareFixedSizeBinary; TypeComparator.prototype.visitDate = compareDate; diff --git a/js/src/visitor/typector.ts b/js/src/visitor/typector.ts index c825a61dbadfb..be98bf63e831b 100644 --- a/js/src/visitor/typector.ts +++ b/js/src/visitor/typector.ts @@ -49,6 +49,7 @@ export class GetDataTypeConstructor extends Visitor { public visitFloat32() { return type.Float32; } public visitFloat64() { return type.Float64; } public visitUtf8() { return type.Utf8; } + public visitLargeUtf8() { return type.LargeUtf8; } public visitBinary() { return type.Binary; } public visitFixedSizeBinary() { return type.FixedSizeBinary; } public visitDate() { return type.Date_; } From 3b2c851b9d57de28244541e685e7bbbfebca7143 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Fri, 26 May 2023 11:48:12 +0200 Subject: [PATCH 02/37] Switch back to int types --- js/src/builder.ts | 2 +- js/src/builder/buffer.ts | 6 +++--- js/src/type.ts | 10 +++++----- js/src/visitor/get.ts | 2 +- js/src/visitor/set.ts | 5 +++-- 5 files changed, 13 insertions(+), 12 deletions(-) diff --git a/js/src/builder.ts b/js/src/builder.ts index a2f32c712ea42..63b1e6e6d24d3 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -290,7 +290,7 @@ export abstract class Builder { valueOffsets = _offsets?.flush(length); } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8), and Lists // Binary, Utf8 - data = _values?.flush(_offsets.last()); + data = _values?.flush(Number(_offsets.last())); } else { // Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, and Interval) data = _values?.flush(length); } diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index c9ffc255cca7d..6d70934102de5 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -101,7 +101,7 @@ export class BufferBuilder extends BufferBuilder { +export class DataBufferBuilder extends BufferBuilder { public last() { return 
this.get(this.length - 1); } public get(index: number) { return this.buffer[index]; } public set(index: number, value: number) { @@ -137,10 +137,10 @@ export class BitmapBufferBuilder extends DataBufferBuilder { /** @ignore */ export class OffsetsBufferBuilder extends DataBufferBuilder { constructor(type: T) { super(new type.OffsetType(1), 1); } - public append(value: number) { + public append(value: number | bigint) { return this.set(this.length - 1, value); } - public set(index: number, value: number) { + public set(index: number, value: number | bigint) { const offset = this.length - 1; const buffer = this.reserve(index - offset + 1).buffer; if (offset < index++) { diff --git a/js/src/type.ts b/js/src/type.ts index b88399d4253bb..d7b5bc51cd7b6 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -42,7 +42,7 @@ export interface DataType | BigIntArrayConstructor; + readonly OffsetType: TypedArrayConstructor | BigIntArrayConstructor; readonly children: Field[]; } @@ -250,7 +250,7 @@ export class Binary extends DataType { } /** @ignore */ -export interface Utf8 extends DataType { TArray: Uint8Array; TOffset: Uint32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetType: TypedArrayConstructor } +export interface Utf8 extends DataType { TArray: Uint8Array; TOffset: Int32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetType: TypedArrayConstructor } /** @ignore */ export class Utf8 extends DataType { constructor() { @@ -260,13 +260,13 @@ export class Utf8 extends DataType { public toString() { return `Utf8`; } protected static [Symbol.toStringTag] = ((proto: Utf8) => { (proto).ArrayType = Uint8Array; - (proto).OffsetType = Uint32Array; + (proto).OffsetType = Int32Array; return proto[Symbol.toStringTag] = 'Utf8'; })(Utf8.prototype); } /** @ignore */ -export interface LargeUtf8 extends DataType { TArray: Uint8Array; TOffset: BigUint64Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetType: BigIntArrayConstructor } +export interface 
LargeUtf8 extends DataType { TArray: Uint8Array; TOffset: BigInt64Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetType: BigIntArrayConstructor } /** @ignore */ export class LargeUtf8 extends DataType { constructor() { @@ -276,7 +276,7 @@ export class LargeUtf8 extends DataType { public toString() { return `LargeUtf8`; } protected static [Symbol.toStringTag] = ((proto: LargeUtf8) => { (proto).ArrayType = Uint8Array; - (proto).OffsetType = BigUint64Array; + (proto).OffsetType = BigInt64Array; return proto[Symbol.toStringTag] = 'LargeUtf8'; })(LargeUtf8.prototype); } diff --git a/js/src/visitor/get.ts b/js/src/visitor/get.ts index 84e97180016f3..bc30668fe3d0a 100644 --- a/js/src/visitor/get.ts +++ b/js/src/visitor/get.ts @@ -108,7 +108,7 @@ function wrapGet(fn: (data: Data, _1: any) => any) { /** @ignore */ const getNull = (_data: Data, _index: number): T['TValue'] => null; /** @ignore */ -const getVariableWidthBytes = (values: Uint8Array, valueOffsets: Uint32Array | BigUint64Array, index: number) => { +const getVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array | BigInt64Array, index: number) => { if (index + 1 >= valueOffsets.length) { return null as any; } diff --git a/js/src/visitor/set.ts b/js/src/visitor/set.ts index e356d524fa085..77696492ec60f 100644 --- a/js/src/visitor/set.ts +++ b/js/src/visitor/set.ts @@ -118,9 +118,10 @@ export const setEpochMsToNanosecondsLong = (data: Int32Array, index: number, epo }; /** @ignore */ -export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: Uint32Array | BigUint64Array, index: number, value: Uint8Array) => { +export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: T, index: number, value: Uint8Array) => { if (index + 1 < valueOffsets.length) { - const { [index]: x, [index + 1]: y } = valueOffsets as BigUint64Array; + const x = valueOffsets[index] as T extends Int32Array ? number : bigint; + const y = valueOffsets[index + 1] as T extends Int32Array ? 
number : bigint; values.set(value.subarray(0, Number(y - x)), Number(x)); } }; From 9562d08d7119ff407bf44cb787ced1aa6dc0c8d0 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sun, 11 Jun 2023 14:23:06 -0400 Subject: [PATCH 03/37] Adding more largeUtf8 stuff --- js/src/data.ts | 5 +- js/src/visitor.ts | 1 + js/src/visitor/builderctor.ts | 2 + js/src/visitor/get.ts | 3 +- js/src/visitor/indexof.ts | 3 +- js/src/visitor/iterator.ts | 3 +- js/src/visitor/jsontypeassembler.ts | 3 + js/src/visitor/jsonvectorassembler.ts | 1 + js/src/visitor/vectorassembler.ts | 1 + js/test/generate-test-data.ts | 79 ++++++++++++++++++++------- js/test/unit/visitor-tests.ts | 6 +- 11 files changed, 80 insertions(+), 27 deletions(-) diff --git a/js/src/data.ts b/js/src/data.ts index 69fcf17d13212..773c736fb17b2 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -29,7 +29,7 @@ import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; /** @ignore */ export type NullBuffer = Uint8Array | null | undefined; /** @ignore */ export type TypeIdsBuffer = Int8Array | ArrayLike | Iterable | undefined; -/** @ignore */ export type ValueOffsetsBuffer = Int32Array | ArrayLike | Iterable | undefined; +/** @ignore */ export type ValueOffsetsBuffer = Int32Array | BigInt64Array | ArrayLike | Iterable | undefined; /** @ignore */ export type DataBuffer = T['TArray'] | ArrayLike | Iterable | undefined; /** @ignore */ @@ -435,7 +435,7 @@ interface IntervalDataProps extends DataProps_ { data?: D interface FixedSizeBinaryDataProps extends DataProps_ { data?: DataBuffer } interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } -interface LargeUtf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface LargeUtf8DataProps extends DataProps_ { valueOffsets: BigInt64Array; data?: DataBuffer } interface ListDataProps extends 
DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } interface FixedSizeListDataProps extends DataProps_ { child: Data } interface StructDataProps extends DataProps_ { children: Data[] } @@ -484,6 +484,7 @@ export function makeData(props: IntervalDataProps): Data< export function makeData(props: FixedSizeBinaryDataProps): Data; export function makeData(props: BinaryDataProps): Data; export function makeData(props: Utf8DataProps): Data; +export function makeData(props: LargeUtf8DataProps): Data; export function makeData(props: ListDataProps): Data; export function makeData(props: FixedSizeListDataProps): Data; export function makeData(props: StructDataProps): Data; diff --git a/js/src/visitor.ts b/js/src/visitor.ts index 3be50a6d3eacf..a923d025506fd 100644 --- a/js/src/visitor.ts +++ b/js/src/visitor.ts @@ -36,6 +36,7 @@ export abstract class Visitor { public visitInt(_node: any, ..._args: any[]): any { return null; } public visitFloat(_node: any, ..._args: any[]): any { return null; } public visitUtf8(_node: any, ..._args: any[]): any { return null; } + public visitLargeUtf8(_node: any, ..._args: any[]): any { return null; } public visitBinary(_node: any, ..._args: any[]): any { return null; } public visitFixedSizeBinary(_node: any, ..._args: any[]): any { return null; } public visitDate(_node: any, ..._args: any[]): any { return null; } diff --git a/js/src/visitor/builderctor.ts b/js/src/visitor/builderctor.ts index 9ce9ae4d4a797..5b9af82a2a06d 100644 --- a/js/src/visitor/builderctor.ts +++ b/js/src/visitor/builderctor.ts @@ -39,6 +39,7 @@ import { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, import { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecondBuilder, TimeNanosecondBuilder } from '../builder/time.js'; import { UnionBuilder, DenseUnionBuilder, SparseUnionBuilder } from '../builder/union.js'; import { Utf8Builder } from '../builder/utf8.js'; +import { LargeUtf8Builder } from '../builder/largeutf8.js'; 
/** @ignore */ export interface GetBuilderCtor extends Visitor { @@ -66,6 +67,7 @@ export class GetBuilderCtor extends Visitor { public visitFloat32() { return Float32Builder; } public visitFloat64() { return Float64Builder; } public visitUtf8() { return Utf8Builder; } + public visitLargeUtf8() { return LargeUtf8Builder; } public visitBinary() { return BinaryBuilder; } public visitFixedSizeBinary() { return FixedSizeBinaryBuilder; } public visitDate() { return DateBuilder; } diff --git a/js/src/visitor/get.ts b/js/src/visitor/get.ts index bc30668fe3d0a..83dc4dbce97be 100644 --- a/js/src/visitor/get.ts +++ b/js/src/visitor/get.ts @@ -34,7 +34,7 @@ import { Interval, IntervalDayTime, IntervalYearMonth, Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, - Union, DenseUnion, SparseUnion, + Union, DenseUnion, SparseUnion, LargeUtf8, } from '../type.js'; /** @ignore */ @@ -59,6 +59,7 @@ export interface GetVisitor extends Visitor { visitFloat32(data: Data, index: number): T['TValue'] | null; visitFloat64(data: Data, index: number): T['TValue'] | null; visitUtf8(data: Data, index: number): T['TValue'] | null; + visitLargeUtf8(data: Data, index: number): T['TValue'] | null; visitBinary(data: Data, index: number): T['TValue'] | null; visitFixedSizeBinary(data: Data, index: number): T['TValue'] | null; visitDate(data: Data, index: number): T['TValue'] | null; diff --git a/js/src/visitor/indexof.ts b/js/src/visitor/indexof.ts index 654134c6dff04..5357aa44fbd4c 100644 --- a/js/src/visitor/indexof.ts +++ b/js/src/visitor/indexof.ts @@ -24,7 +24,7 @@ import { getBool, BitIterator } from '../util/bit.js'; import { createElementComparator } from '../util/vector.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, 
Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -56,6 +56,7 @@ export interface IndexOfVisitor extends Visitor { visitFloat32(data: Data, value: T['TValue'] | null, index?: number): number; visitFloat64(data: Data, value: T['TValue'] | null, index?: number): number; visitUtf8(data: Data, value: T['TValue'] | null, index?: number): number; + visitLargeUtf8(data: Data, value: T['TValue'] | null, index?: number): number; visitBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitFixedSizeBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitDate(data: Data, value: T['TValue'] | null, index?: number): number; diff --git a/js/src/visitor/iterator.ts b/js/src/visitor/iterator.ts index 48021a78e86f6..b2fa9eaca82db 100644 --- a/js/src/visitor/iterator.ts +++ b/js/src/visitor/iterator.ts @@ -21,7 +21,7 @@ import { Type, Precision } from '../enum.js'; import { TypeToDataType } from '../interfaces.js'; import { DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -54,6 +54,7 @@ export interface IteratorVisitor extends Visitor { visitFloat32(vector: Vector): IterableIterator; visitFloat64(vector: Vector): IterableIterator; visitUtf8(vector: Vector): IterableIterator; + visitLargeUtf8(vector: Vector): IterableIterator; visitBinary(vector: Vector): IterableIterator; visitFixedSizeBinary(vector: Vector): IterableIterator; visitDate(vector: Vector): IterableIterator; diff --git a/js/src/visitor/jsontypeassembler.ts b/js/src/visitor/jsontypeassembler.ts index d83edfc24fbd8..f3da67cdf316e 100644 --- a/js/src/visitor/jsontypeassembler.ts +++ 
b/js/src/visitor/jsontypeassembler.ts @@ -48,6 +48,9 @@ export class JSONTypeAssembler extends Visitor { public visitUtf8({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitLargeUtf8({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitDecimal({ typeId, scale, precision, bitWidth }: T) { return { 'name': ArrowType[typeId].toLowerCase(), 'scale': scale, 'precision': precision, 'bitWidth': bitWidth }; } diff --git a/js/src/visitor/jsonvectorassembler.ts b/js/src/visitor/jsonvectorassembler.ts index 5c4897071abd2..316aef1cb7db9 100644 --- a/js/src/visitor/jsonvectorassembler.ts +++ b/js/src/visitor/jsonvectorassembler.ts @@ -42,6 +42,7 @@ export interface JSONVectorAssembler extends Visitor { visitInt(data: Data): { DATA: number[] | string[] }; visitFloat(data: Data): { DATA: number[] }; visitUtf8(data: Data): { DATA: string[]; OFFSET: number[] }; + visitLargeUtf8(data: Data): { DATA: string[]; OFFSET: bigint[] }; visitBinary(data: Data): { DATA: string[]; OFFSET: number[] }; visitFixedSizeBinary(data: Data): { DATA: string[] }; visitDate(data: Data): { DATA: number[] }; diff --git a/js/src/visitor/vectorassembler.ts b/js/src/visitor/vectorassembler.ts index dbf778c4c3631..f45d6246f3731 100644 --- a/js/src/visitor/vectorassembler.ts +++ b/js/src/visitor/vectorassembler.ts @@ -233,6 +233,7 @@ VectorAssembler.prototype.visitBool = assembleBoolVector; VectorAssembler.prototype.visitInt = assembleFlatVector; VectorAssembler.prototype.visitFloat = assembleFlatVector; VectorAssembler.prototype.visitUtf8 = assembleFlatListVector; +VectorAssembler.prototype.visitLargeUtf8 = assembleFlatListVector; VectorAssembler.prototype.visitBinary = assembleFlatListVector; VectorAssembler.prototype.visitFixedSizeBinary = assembleFlatVector; VectorAssembler.prototype.visitDate = assembleFlatVector; diff --git a/js/test/generate-test-data.ts b/js/test/generate-test-data.ts index a03b22c54c770..4137e2d0ba535 100644 --- 
a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -73,6 +73,7 @@ interface TestDataVectorGenerator extends Visitor { visitUint64: typeof generateBigInt; visitFloat: typeof generateFloat; visitUtf8: typeof generateUtf8; + visitLargeUtf8: typeof generateLargeUtf8; visitBinary: typeof generateBinary; visitFixedSizeBinary: typeof generateFixedSizeBinary; visitDate: typeof generateDate; @@ -304,7 +305,7 @@ function generateFloat(this: TestDataVectorGenerator, type: T, function generateUtf8(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, 10, 20, nullCount != 0); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, 10, 20, nullCount != 0); const values: string[] = new Array(valueOffsets.length - 1).fill(null); [...valueOffsets.slice(1)] .map((o, i) => isValid(nullBitmap, i) ? o - valueOffsets[i] : null) @@ -324,9 +325,31 @@ function generateUtf8(this: TestDataVectorGenerator, type: T, le return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; } +function generateLargeUtf8(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { + const nullBitmap = createBitmap(length, nullCount); + const valueOffsets = createVariableWidthOffsets64(length, nullBitmap, 10, 20, nullCount != 0); + const values: string[] = new Array(valueOffsets.length - 1).fill(null); + [...valueOffsets.slice(1)] + .map((o, i) => isValid(nullBitmap, i) ? 
o - valueOffsets[i] : null) + .reduce((map, length, i) => { + if (length !== null) { + if (length > 0) { + do { + values[i] = randomString(Number(length)); + } while (map.has(values[i])); + return map.set(values[i], i); + } + values[i] = ''; + } + return map; + }, new Map()); + const data = createVariableWidthBytes(length, nullBitmap, valueOffsets, (i) => encodeUtf8(values[i])); + return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; +} + function generateBinary(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, 10, 20, nullCount != 0); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, 10, 20, nullCount != 0); const values = [...valueOffsets.slice(1)] .map((o, i) => isValid(nullBitmap, i) ? o - valueOffsets[i] : null) .map((length) => length == null ? 
null : randomBytes(length)); @@ -425,7 +448,7 @@ function generateList(this: TestDataVectorGenerator, type: T, le const childVec = child.vector; const nullBitmap = createBitmap(length, nullCount); const stride = childVec.length / (length - nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, stride, stride); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, stride, stride); const values = memoize(() => { const childValues = child.values(); const values: (T['valueType'] | null)[] = [...valueOffsets.slice(1)] @@ -563,7 +586,7 @@ function generateMap(this: TestDataVectorGenerator, const childVec = child.vector; const nullBitmap = createBitmap(length, nullCount); const stride = childVec.length / (length - nullCount); - const valueOffsets = createVariableWidthOffsets(length, nullBitmap, stride, stride); + const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, stride, stride); const values = memoize(() => { const childValues: { key: K; value: V }[] = child.values(); const values: (Record | null)[] = [...valueOffsets.slice(1)] @@ -642,24 +665,38 @@ function createBitmap(length: number, nullCount: number) { return bytes; } -function createVariableWidthOffsets(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { - const offsets = new Int32Array(length + 1); - iterateBitmap(length, nullBitmap, (i, valid) => { - if (!valid) { - offsets[i + 1] = offsets[i]; - } else { - do { - offsets[i + 1] = offsets[i] + Math.min(max, Math.max(min, Math.trunc(rand() * max))); - } while (!allowEmpty && offsets[i + 1] === offsets[i]); - } - }); - return offsets; -} - -function createVariableWidthBytes(length: number, nullBitmap: Uint8Array, offsets: Int32Array, getBytes: (index: number) => Uint8Array) { - const bytes = new Uint8Array(offsets[length]); +function createVariableWidthOffsets32(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, 
allowEmpty = true) { + const offsets = new Int32Array(length + 1); + iterateBitmap(length, nullBitmap, (i, valid) => { + if (!valid) { + offsets[i + 1] = offsets[i]; + } else { + do { + offsets[i + 1] = offsets[i] + Math.min(max, Math.max(min, Math.trunc(rand() * max))); + } while (!allowEmpty && offsets[i + 1] === offsets[i]); + } + }); + return offsets; +} + +function createVariableWidthOffsets64(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { + const offsets = new BigInt64Array(length + 1); + iterateBitmap(length, nullBitmap, (i, valid) => { + if (!valid) { + offsets[i + 1] = offsets[i]; + } else { + do { + offsets[i + 1] = offsets[i] + BigInt(Math.min(max, Math.max(min, Math.trunc(rand() * max)))); + } while (!allowEmpty && offsets[i + 1] === offsets[i]); + } + }); + return offsets; +} + +function createVariableWidthBytes(length: number, nullBitmap: Uint8Array, offsets: Int32Array | BigInt64Array, getBytes: (index: number) => Uint8Array) { + const bytes = new Uint8Array(Number(offsets[length])); iterateBitmap(length, nullBitmap, (i, valid) => { - valid && bytes.set(getBytes(i), offsets[i]); + valid && bytes.set(getBytes(i), Number(offsets[i])); }); return bytes; } diff --git a/js/test/unit/visitor-tests.ts b/js/test/unit/visitor-tests.ts index 645fcc60f8d90..2b1cfb4a6d80b 100644 --- a/js/test/unit/visitor-tests.ts +++ b/js/test/unit/visitor-tests.ts @@ -18,7 +18,7 @@ import { Field, Visitor, DataType, Dictionary, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -35,6 +35,7 @@ class BasicVisitor extends Visitor { public visitInt(type: T) { return (this.type = type); } public visitFloat(type: T) { return (this.type = type); } public 
visitUtf8(type: T) { return (this.type = type); } + public visitLargeUtf8(type: T) { return (this.type = type); } public visitBinary(type: T) { return (this.type = type); } public visitFixedSizeBinary(type: T) { return (this.type = type); } public visitDate(type: T) { return (this.type = type); } @@ -66,6 +67,7 @@ class FeatureVisitor extends Visitor { public visitFloat32(type: T) { return (this.type = type); } public visitFloat64(type: T) { return (this.type = type); } public visitUtf8(type: T) { return (this.type = type); } + public visitLargeUtf8(type: T) { return (this.type = type); } public visitBinary(type: T) { return (this.type = type); } public visitFixedSizeBinary(type: T) { return (this.type = type); } public visitDateDay(type: T) { return (this.type = type); } @@ -98,6 +100,7 @@ describe('Visitor', () => { test(`visits Int types`, () => validateBasicVisitor(new Int(true, 32))); test(`visits Float types`, () => validateBasicVisitor(new Float(0))); test(`visits Utf8 types`, () => validateBasicVisitor(new Utf8())); + test(`visits LargeUtf8 types`, () => validateBasicVisitor(new LargeUtf8())); test(`visits Binary types`, () => validateBasicVisitor(new Binary())); test(`visits FixedSizeBinary types`, () => validateBasicVisitor(new FixedSizeBinary(128))); test(`visits Date types`, () => validateBasicVisitor(new Date_(0))); @@ -137,6 +140,7 @@ describe('Visitor', () => { test(`visits Float32 types`, () => validateFeatureVisitor(new Float32())); test(`visits Float64 types`, () => validateFeatureVisitor(new Float64())); test(`visits Utf8 types`, () => validateFeatureVisitor(new Utf8())); + test(`visits LargeUtf8 types`, () => validateFeatureVisitor(new LargeUtf8())); test(`visits Binary types`, () => validateFeatureVisitor(new Binary())); test(`visits FixedSizeBinary types`, () => validateFeatureVisitor(new FixedSizeBinary(128))); test(`visits DateDay types`, () => validateFeatureVisitor(new DateDay())); From 775370a952196b6cfd97eab4cfc7ab78940849f7 Mon Sep 17 
00:00:00 2001 From: Dominik Moritz Date: Sun, 11 Jun 2023 14:29:25 -0400 Subject: [PATCH 04/37] Fix remaining type issues --- js/src/builder.ts | 2 +- js/src/visitor.ts | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/js/src/builder.ts b/js/src/builder.ts index 63b1e6e6d24d3..c28c4065f1797 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -199,7 +199,7 @@ export abstract class Builder { return this.children.reduce((size, child) => size + child.reservedByteLength, size); } - declare protected _offsets: DataBufferBuilder; + declare protected _offsets: DataBufferBuilder; public get valueOffsets() { return this._offsets ? this._offsets.buffer : null; } declare protected _values: BufferBuilder; diff --git a/js/src/visitor.ts b/js/src/visitor.ts index a923d025506fd..e4361180cb0ff 100644 --- a/js/src/visitor.ts +++ b/js/src/visitor.ts @@ -89,6 +89,7 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.Float32: fn = visitor.visitFloat32 || visitor.visitFloat; break; case Type.Float64: fn = visitor.visitFloat64 || visitor.visitFloat; break; case Type.Utf8: fn = visitor.visitUtf8; break; + case Type.LargeUtf8: fn = visitor.visitLargeUtf8 || visitor.visitUtf8; break; case Type.Binary: fn = visitor.visitBinary; break; case Type.FixedSizeBinary: fn = visitor.visitFixedSizeBinary; break; case Type.Date: fn = visitor.visitDate; break; @@ -147,6 +148,7 @@ function inferDType(type: T): Type { return Type.Float; case Type.Binary: return Type.Binary; case Type.Utf8: return Type.Utf8; + case Type.LargeUtf8: return Type.LargeUtf8; case Type.Bool: return Type.Bool; case Type.Decimal: return Type.Decimal; case Type.Time: @@ -215,6 +217,7 @@ export interface Visitor { visitFloat32?(node: any, ...args: any[]): any; visitFloat64?(node: any, ...args: any[]): any; visitUtf8(node: any, ...args: any[]): any; + visitLargeUtf8(node: any, ...args: any[]): any; visitBinary(node: any, ...args: any[]): any; 
visitFixedSizeBinary(node: any, ...args: any[]): any; visitDate(node: any, ...args: any[]): any; From 46ef8046b35931a01f728abea4b8bbe0f7c7e7f8 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sun, 11 Jun 2023 15:26:27 -0400 Subject: [PATCH 05/37] Update comment --- js/src/builder.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/src/builder.ts b/js/src/builder.ts index c28c4065f1797..3531e24738d7c 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -289,7 +289,7 @@ export abstract class Builder { // DenseUnions valueOffsets = _offsets?.flush(length); } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8), and Lists - // Binary, Utf8 + // Binary, Utf8, LargeUtf8 data = _values?.flush(Number(_offsets.last())); } else { // Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, and Interval) data = _values?.flush(length); From e11b0785310e35837f830502392b4b684571cc0d Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sun, 11 Jun 2023 22:24:58 -0400 Subject: [PATCH 06/37] Fix --- js/src/visitor.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/src/visitor.ts b/js/src/visitor.ts index e4361180cb0ff..4652aad0cb895 100644 --- a/js/src/visitor.ts +++ b/js/src/visitor.ts @@ -89,7 +89,7 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.Float32: fn = visitor.visitFloat32 || visitor.visitFloat; break; case Type.Float64: fn = visitor.visitFloat64 || visitor.visitFloat; break; case Type.Utf8: fn = visitor.visitUtf8; break; - case Type.LargeUtf8: fn = visitor.visitLargeUtf8 || visitor.visitUtf8; break; + case Type.LargeUtf8: fn = visitor.visitLargeUtf8; break; case Type.Binary: fn = visitor.visitBinary; break; case Type.FixedSizeBinary: fn = visitor.visitFixedSizeBinary; break; case Type.Date: fn = visitor.visitDate; break; From 3e91639fb61c591c5fc4cc53dfedfd79f7b80838 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Mon, 19 Jun 2023 
22:06:41 -0700 Subject: [PATCH 07/37] Correct and refine types --- js/src/builder.ts | 2 +- js/src/builder/buffer.ts | 20 +++++++++----------- js/src/type.ts | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/js/src/builder.ts b/js/src/builder.ts index 3531e24738d7c..b9bcc443c3f1b 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -202,7 +202,7 @@ export abstract class Builder { declare protected _offsets: DataBufferBuilder; public get valueOffsets() { return this._offsets ? this._offsets.buffer : null; } - declare protected _values: BufferBuilder; + declare protected _values: BufferBuilder; public get values() { return this._values ? this._values.buffer : null; } declare protected _nulls: BitmapBufferBuilder; diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index 6d70934102de5..7fe9c39c6da3b 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -22,8 +22,6 @@ import { } from '../interfaces.js'; import { DataType } from '../type.js'; -/** @ignore */ type DataValue = T extends TypedArray ? number : T extends BigIntArray ? WideValue : T; -/** @ignore */ type WideValue = T extends BigIntArray ? bigint | Int32Array | Uint32Array : never; /** @ignore */ type ArrayCtor = T extends TypedArray ? TypedArrayConstructor : T extends BigIntArray ? 
BigIntArrayConstructor : @@ -37,12 +35,12 @@ const sliceOrExtendArray = (arr: T, len = 0) ) as T; /** @ignore */ -export interface BufferBuilder> { +export interface BufferBuilder { readonly offset: number; } /** @ignore */ -export class BufferBuilder> { +export class BufferBuilder { constructor(buffer: T, stride = 1) { this.buffer = buffer; @@ -65,8 +63,8 @@ export class BufferBuilder 0) { this.length += extra; @@ -101,10 +99,10 @@ export class BufferBuilder extends BufferBuilder { +export class DataBufferBuilder extends BufferBuilder { public last() { return this.get(this.length - 1); } - public get(index: number) { return this.buffer[index]; } - public set(index: number, value: number) { + public get(index: number): T[0] { return this.buffer[index]; } + public set(index: number, value: T[0]) { this.reserve(index - this.length + 1); this.buffer[index * this.stride] = value; return this; @@ -137,10 +135,10 @@ export class BitmapBufferBuilder extends DataBufferBuilder { /** @ignore */ export class OffsetsBufferBuilder extends DataBufferBuilder { constructor(type: T) { super(new type.OffsetType(1), 1); } - public append(value: number | bigint) { + public append(value: T['TOffset'][0]) { return this.set(this.length - 1, value); } - public set(index: number, value: number | bigint) { + public set(index: number, value: T['TOffset'][0]) { const offset = this.length - 1; const buffer = this.reserve(index - offset + 1).buffer; if (offset < index++) { diff --git a/js/src/type.ts b/js/src/type.ts index d7b5bc51cd7b6..bd20ee7e9b582 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -568,7 +568,7 @@ export class FixedSizeBinary extends DataType { protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => { (proto).byteWidth = null; (proto).ArrayType = Uint8Array; - (proto).OffsetType = Uint32Array; + (proto).OffsetType = Int32Array; return proto[Symbol.toStringTag] = 'FixedSizeBinary'; })(FixedSizeBinary.prototype); } From 564a3e8e74c7ddc406d9e4cad45c3896200fee60 
Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Mon, 19 Jun 2023 22:50:37 -0700 Subject: [PATCH 08/37] More refined types --- js/src/builder.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/src/builder.ts b/js/src/builder.ts index b9bcc443c3f1b..fa0e45c3a9f18 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -199,7 +199,7 @@ export abstract class Builder { return this.children.reduce((size, child) => size + child.reservedByteLength, size); } - declare protected _offsets: DataBufferBuilder; + declare protected _offsets: DataBufferBuilder; public get valueOffsets() { return this._offsets ? this._offsets.buffer : null; } declare protected _values: BufferBuilder; From b73a1f46b220480b1c635f1ea32f83994952a2bf Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Mon, 19 Jun 2023 22:51:18 -0700 Subject: [PATCH 09/37] Move ArrayCtor type --- js/src/builder/buffer.ts | 10 +--------- js/src/interfaces.ts | 6 ++++++ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index 7fe9c39c6da3b..4c249ef6c15aa 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -16,17 +16,9 @@ // under the License. import { memcpy } from '../util/buffer.js'; -import { - TypedArray, TypedArrayConstructor, - BigIntArray, BigIntArrayConstructor -} from '../interfaces.js'; +import { TypedArray, BigIntArray, ArrayCtor } from '../interfaces.js'; import { DataType } from '../type.js'; -/** @ignore */ type ArrayCtor = - T extends TypedArray ? TypedArrayConstructor : - T extends BigIntArray ? 
BigIntArrayConstructor : - any; - /** @ignore */ const roundLengthUpToNearest64Bytes = (len: number, BPE: number) => ((((Math.ceil(len) * BPE) + 63) & ~63) || 64) / BPE; /** @ignore */ diff --git a/js/src/interfaces.ts b/js/src/interfaces.ts index 6ba7912342426..f6e16eafbb236 100644 --- a/js/src/interfaces.ts +++ b/js/src/interfaces.ts @@ -98,6 +98,12 @@ export interface BigIntArrayConstructor { from(arrayLike: ArrayLike, mapfn: (v: U, k: number) => bigint, thisArg?: any): T; } +/** @ignore */ +export type ArrayCtor = + T extends TypedArray ? TypedArrayConstructor : + T extends BigIntArray ? BigIntArrayConstructor : + any; + /** @ignore */ export type BuilderCtorArgs< T extends BuilderType, From 3f9359b60405af5bbddc55417706447dba926896 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Mon, 19 Jun 2023 23:05:10 -0700 Subject: [PATCH 10/37] Change default offset to Int32Array --- js/src/type.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/js/src/type.ts b/js/src/type.ts index bd20ee7e9b582..27bfaabdf6035 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -82,7 +82,7 @@ export abstract class DataType { (proto).children = null; (proto).ArrayType = Array; - (proto).OffsetType = Array; + (proto).OffsetType = Int32Array; return proto[Symbol.toStringTag] = 'DataType'; })(DataType.prototype); } @@ -260,7 +260,6 @@ export class Utf8 extends DataType { public toString() { return `Utf8`; } protected static [Symbol.toStringTag] = ((proto: Utf8) => { (proto).ArrayType = Uint8Array; - (proto).OffsetType = Int32Array; return proto[Symbol.toStringTag] = 'Utf8'; })(Utf8.prototype); } From 263ce42f73dda1c324120b9eb7e8e46068779101 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Mon, 19 Jun 2023 23:28:33 -0700 Subject: [PATCH 11/37] Test large utf8 builder --- js/test/generate-test-data.ts | 4 +++- js/test/unit/builders/builder-tests.ts | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/js/test/generate-test-data.ts 
b/js/test/generate-test-data.ts index 4137e2d0ba535..0bdde3c617987 100644 --- a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -24,7 +24,7 @@ import { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, + Utf8, LargeUtf8, Binary, FixedSizeBinary, Date_, DateDay, DateMillisecond, @@ -98,6 +98,7 @@ TestDataVectorGenerator.prototype.visitInt64 = generateBigInt; TestDataVectorGenerator.prototype.visitUint64 = generateBigInt; TestDataVectorGenerator.prototype.visitFloat = generateFloat; TestDataVectorGenerator.prototype.visitUtf8 = generateUtf8; +TestDataVectorGenerator.prototype.visitLargeUtf8 = generateLargeUtf8; TestDataVectorGenerator.prototype.visitBinary = generateBinary; TestDataVectorGenerator.prototype.visitFixedSizeBinary = generateFixedSizeBinary; TestDataVectorGenerator.prototype.visitDate = generateDate; @@ -211,6 +212,7 @@ export const float16 = (length = 100, nullCount = Math.trunc(length * 0.2)) => v export const float32 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Float32(), length, nullCount); export const float64 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Float64(), length, nullCount); export const utf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Utf8(), length, nullCount); +export const largeUtf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new LargeUtf8(), length, nullCount); export const binary = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Binary(), length, nullCount); export const fixedSizeBinary = (length = 100, nullCount = Math.trunc(length * 0.2), byteWidth = 8) => vectorGenerator.visit(new FixedSizeBinary(byteWidth), length, nullCount); export const dateDay = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DateDay(), length, nullCount); diff 
--git a/js/test/unit/builders/builder-tests.ts b/js/test/unit/builders/builder-tests.ts index a73183a7a5d47..1648b8b1b2d2f 100644 --- a/js/test/unit/builders/builder-tests.ts +++ b/js/test/unit/builders/builder-tests.ts @@ -44,6 +44,7 @@ describe('Generated Test Data', () => { describe('Float32Builder', () => { validateBuilder(generate.float32); }); describe('Float64Builder', () => { validateBuilder(generate.float64); }); describe('Utf8Builder', () => { validateBuilder(generate.utf8); }); + describe('LargeUtf8Builder', () => { validateBuilder(generate.largeUtf8); }); describe('BinaryBuilder', () => { validateBuilder(generate.binary); }); describe('FixedSizeBinaryBuilder', () => { validateBuilder(generate.fixedSizeBinary); }); describe('DateDayBuilder', () => { validateBuilder(generate.dateDay); }); From 95410d0807620e977ea746260757b4c5d80d7fc3 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Tue, 20 Jun 2023 16:51:14 -0700 Subject: [PATCH 12/37] Fix support for bigints and add missing visitors --- js/src/builder/binary.ts | 4 ++-- js/src/builder/buffer.ts | 14 ++++++++++++-- js/src/visitor/get.ts | 6 ++++++ js/src/visitor/iterator.ts | 1 + 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/js/src/builder/binary.ts b/js/src/builder/binary.ts index 3c12ddf34abb0..bcdefab9d3e66 100644 --- a/js/src/builder/binary.ts +++ b/js/src/builder/binary.ts @@ -42,11 +42,11 @@ export class BinaryBuilder extends VariableWidthBuilder ((((Math.ceil(len) * BPE) + 63) & ~63) || 64) / BPE; @@ -126,7 +127,16 @@ export class BitmapBufferBuilder extends DataBufferBuilder { /** @ignore */ export class OffsetsBufferBuilder extends DataBufferBuilder { - constructor(type: T) { super(new type.OffsetType(1), 1); } + constructor(type: T) { + super(new type.OffsetType(1), 1); + this.toNumber = type.OffsetType === BigInt64Array ? BigInt : bigIntToNumber as any; + } + + /** + * The correct number constructor for the buffer type. 
+ */ + public toNumber: ((number: number | bigint) => T['TOffset'] extends BigInt64Array ? bigint : number); + public append(value: T['TOffset'][0]) { return this.set(this.length - 1, value); } @@ -141,7 +151,7 @@ export class OffsetsBufferBuilder extends DataBufferBuilder< } public flush(length = this.length - 1) { if (length > this.length) { - this.set(length - 1, 0); + this.set(length - 1, this.toNumber(0)); } return super.flush(length + 1); } diff --git a/js/src/visitor/get.ts b/js/src/visitor/get.ts index 83dc4dbce97be..8f93204abe61e 100644 --- a/js/src/visitor/get.ts +++ b/js/src/visitor/get.ts @@ -150,6 +150,11 @@ const getUtf8 = ({ values, valueOffsets }: Data, index: numbe const bytes = getVariableWidthBytes(values, valueOffsets, index); return bytes !== null ? decodeUtf8(bytes) : null as any; }; +/** @ignore */ +const getLargeUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { + const bytes = getVariableWidthBytes(values, valueOffsets, index); + return bytes !== null ? 
decodeUtf8(bytes) : null as any; +}; /* istanbul ignore next */ /** @ignore */ @@ -304,6 +309,7 @@ GetVisitor.prototype.visitFloat16 = wrapGet(getFloat16); GetVisitor.prototype.visitFloat32 = wrapGet(getNumeric); GetVisitor.prototype.visitFloat64 = wrapGet(getNumeric); GetVisitor.prototype.visitUtf8 = wrapGet(getUtf8); +GetVisitor.prototype.visitLargeUtf8 = wrapGet(getLargeUtf8); GetVisitor.prototype.visitBinary = wrapGet(getBinary); GetVisitor.prototype.visitFixedSizeBinary = wrapGet(getFixedSizeBinary); GetVisitor.prototype.visitDate = wrapGet(getDate); diff --git a/js/src/visitor/iterator.ts b/js/src/visitor/iterator.ts index b2fa9eaca82db..fd87297e6f36e 100644 --- a/js/src/visitor/iterator.ts +++ b/js/src/visitor/iterator.ts @@ -153,6 +153,7 @@ IteratorVisitor.prototype.visitFloat16 = vectorIterator; IteratorVisitor.prototype.visitFloat32 = vectorIterator; IteratorVisitor.prototype.visitFloat64 = vectorIterator; IteratorVisitor.prototype.visitUtf8 = vectorIterator; +IteratorVisitor.prototype.visitLargeUtf8 = vectorIterator; IteratorVisitor.prototype.visitBinary = vectorIterator; IteratorVisitor.prototype.visitFixedSizeBinary = vectorIterator; IteratorVisitor.prototype.visitDate = vectorIterator; From 08533536827195f8f84ebf9516a805ee7aa3b1a9 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Tue, 20 Jun 2023 17:23:44 -0700 Subject: [PATCH 13/37] Fix data generation --- js/src/data.ts | 2 +- js/test/generate-test-data.ts | 2 +- js/test/unit/generated-data-tests.ts | 1 + js/test/unit/generated-data-validators.ts | 20 +++++++++++++++----- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/js/src/data.ts b/js/src/data.ts index 773c736fb17b2..62631d0d9e8e7 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -29,7 +29,7 @@ import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; /** @ignore */ export type NullBuffer = Uint8Array | null | undefined; /** @ignore */ export type TypeIdsBuffer = Int8Array | ArrayLike | Iterable | undefined; -/** 
@ignore */ export type ValueOffsetsBuffer = Int32Array | BigInt64Array | ArrayLike | Iterable | undefined; +/** @ignore */ export type ValueOffsetsBuffer = Int32Array | ArrayLike | Iterable | undefined; /** @ignore */ export type DataBuffer = T['TArray'] | ArrayLike | Iterable | undefined; /** @ignore */ diff --git a/js/test/generate-test-data.ts b/js/test/generate-test-data.ts index 0bdde3c617987..4e0df5c98446f 100644 --- a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -327,7 +327,7 @@ function generateUtf8(this: TestDataVectorGenerator, type: T, le return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; } -function generateLargeUtf8(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { +function generateLargeUtf8(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); const valueOffsets = createVariableWidthOffsets64(length, nullBitmap, 10, 20, nullCount != 0); const values: string[] = new Array(valueOffsets.length - 1).fill(null); diff --git a/js/test/unit/generated-data-tests.ts b/js/test/unit/generated-data-tests.ts index 90cf0d598aa6f..7eaf526d3e1aa 100644 --- a/js/test/unit/generated-data-tests.ts +++ b/js/test/unit/generated-data-tests.ts @@ -38,6 +38,7 @@ describe('Generated Test Data', () => { describe('Float32', () => { validateVector(generate.float32()); }); describe('Float64', () => { validateVector(generate.float64()); }); describe('Utf8', () => { validateVector(generate.utf8()); }); + describe('LargeUtf8', () => { validateVector(generate.largeUtf8()); }); describe('Binary', () => { validateVector(generate.binary()); }); describe('FixedSizeBinary', () => { validateVector(generate.fixedSizeBinary()); }); describe('DateDay', () => { validateVector(generate.dateDay()); }); diff --git 
a/js/test/unit/generated-data-validators.ts b/js/test/unit/generated-data-validators.ts index 52f642d2a6e89..57ee94876c300 100644 --- a/js/test/unit/generated-data-validators.ts +++ b/js/test/unit/generated-data-validators.ts @@ -113,7 +113,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { expected = values[i]; expect(actual).toArrowCompare(expected); } - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); if (keys && keys.length > 0) { test(`dictionary indices should match`, () => { @@ -126,7 +128,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { ? expect(indices.get(i)).toBe(keys[i]) : expect(indices.get(i)).toBeNull(); } - } catch (e) { throw new Error(`${indices}[${i}]: ${e}`); } + } catch (e) { + throw new Error(`${indices}[${i}]: ${e}`); + } }); } test(`sets expected values`, () => { @@ -139,7 +143,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { actual = vector.get(i); expect(actual).toArrowCompare(expected); } - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); test(`iterates expected values`, () => { expect.hasAssertions(); @@ -149,7 +155,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { expected = values[++i]; expect(actual).toArrowCompare(expected); } - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); test(`indexOf returns expected values`, () => { expect.hasAssertions(); @@ -169,7 +177,9 @@ function vectorTests(values: any[], vector: Vector, keys?: number[]) { expect(vector.indexOf('purple elephants')).toBe(-1); expect(vector.indexOf('whistling wombats')).toBe(-1); 
expect(vector.indexOf('carnivorous novices')).toBe(-1); - } catch (e: any) { throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); } + } catch (e: any) { + throw new Error(`${vector}[${i}]:\n\t${e && e.stack || e}`); + } }); } From 7e14e1ed38fde29de0a9345a7c1e4e831d5b364d Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Thu, 22 Jun 2023 17:03:28 -0700 Subject: [PATCH 14/37] Disable test auto run --- js/.vscode/settings.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/.vscode/settings.json b/js/.vscode/settings.json index 113a662180c3c..0bef41b8ac9a8 100644 --- a/js/.vscode/settings.json +++ b/js/.vscode/settings.json @@ -17,6 +17,6 @@ "editor.defaultFormatter": "vscode.typescript-language-features" }, "jest.jestCommandLine": "node --experimental-vm-modules node_modules/jest/bin/jest.js --config jest.config.js", - "jest.autoRun": {"watch": false, "onSave": "test-src-file"}, + "jest.autoRun": {"watch": false, "onSave": false}, "typescript.preferences.importModuleSpecifierEnding": "js" } From 5790d141591339e8b6bf0f74cc5a5abde2659906 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Thu, 22 Jun 2023 17:03:53 -0700 Subject: [PATCH 15/37] Fix index of for largeUTF8 --- js/src/data.ts | 4 ++-- js/src/visitor/indexof.ts | 1 + js/test/generate-test-data.ts | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/js/src/data.ts b/js/src/data.ts index 62631d0d9e8e7..343e7f1fd3553 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -263,7 +263,7 @@ import { } from './type.js'; import { Visitor } from './visitor.js'; -import { toArrayBufferView, toInt32Array, toUint8Array } from './util/buffer.js'; +import { toArrayBufferView, toBigInt64Array, toInt32Array, toUint8Array } from './util/buffer.js'; class MakeDataVisitor extends Visitor { public visit(props: any): Data { @@ -310,7 +310,7 @@ class MakeDataVisitor extends Visitor { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); const 
nullBitmap = toUint8Array(props['nullBitmap']); - const valueOffsets = toInt32Array(props['valueOffsets']); + const valueOffsets = toBigInt64Array(props['valueOffsets']); const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } diff --git a/js/src/visitor/indexof.ts b/js/src/visitor/indexof.ts index 5357aa44fbd4c..c72196502a6b0 100644 --- a/js/src/visitor/indexof.ts +++ b/js/src/visitor/indexof.ts @@ -167,6 +167,7 @@ IndexOfVisitor.prototype.visitFloat16 = indexOfValue; IndexOfVisitor.prototype.visitFloat32 = indexOfValue; IndexOfVisitor.prototype.visitFloat64 = indexOfValue; IndexOfVisitor.prototype.visitUtf8 = indexOfValue; +IndexOfVisitor.prototype.visitLargeUtf8 = indexOfValue; IndexOfVisitor.prototype.visitBinary = indexOfValue; IndexOfVisitor.prototype.visitFixedSizeBinary = indexOfValue; IndexOfVisitor.prototype.visitDate = indexOfValue; diff --git a/js/test/generate-test-data.ts b/js/test/generate-test-data.ts index 4e0df5c98446f..2d450dc3057f7 100644 --- a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -51,6 +51,7 @@ interface TestDataVectorGenerator extends Visitor { visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; + visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; From 200e5e590c7616f05966a7cf978ba80370f90f3a Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Fri, 23 Jun 2023 10:13:01 -0700 Subject: [PATCH 16/37] Fix offset math with bigint --- js/src/builder.ts | 3 ++- js/src/builder/buffer.ts | 2 
+- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/js/src/builder.ts b/js/src/builder.ts index fa0e45c3a9f18..d858cfe19e62f 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -290,7 +290,8 @@ export abstract class Builder { valueOffsets = _offsets?.flush(length); } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8), and Lists // Binary, Utf8, LargeUtf8 - data = _values?.flush(Number(_offsets.last())); + const last = _offsets.last(); + data = _values?.flush(last != undefined ? Number(last) : last); } else { // Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, and Interval) data = _values?.flush(length); } diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index 0fae2e3a16ca5..2ed8265991317 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -143,7 +143,7 @@ export class OffsetsBufferBuilder extends DataBufferBuilder< public set(index: number, value: T['TOffset'][0]) { const offset = this.length - 1; const buffer = this.reserve(index - offset + 1).buffer; - if (offset < index++) { + if (offset < index++ && offset >= 0) { buffer.fill(buffer[offset], offset, index); } buffer[index] = buffer[index - 1] + value; From 7bf542677e871f0614181eaca2b6739cf0ca6587 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Fri, 23 Jun 2023 10:38:57 -0700 Subject: [PATCH 17/37] Update type names --- js/src/builder.ts | 4 ++-- js/src/builder/buffer.ts | 12 ++++++------ js/src/data.ts | 2 +- js/src/type.ts | 16 ++++++++-------- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/js/src/builder.ts b/js/src/builder.ts index d858cfe19e62f..de8f1beefae66 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -163,7 +163,7 @@ export abstract class Builder { public toVector() { return new Vector([this.flush()]); } public get ArrayType() { return this.type.ArrayType; } - public get OffsetType() { return this.type.OffsetType; } + public get OffsetArrayType() { return 
this.type.OffsetArrayType; } public get nullCount() { return this._nulls.numInvalid; } public get numChildren() { return this.children.length; } @@ -199,7 +199,7 @@ export abstract class Builder { return this.children.reduce((size, child) => size + child.reservedByteLength, size); } - declare protected _offsets: DataBufferBuilder; + declare protected _offsets: DataBufferBuilder; public get valueOffsets() { return this._offsets ? this._offsets.buffer : null; } declare protected _values: BufferBuilder; diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index 2ed8265991317..c0226c2168786 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -126,21 +126,21 @@ export class BitmapBufferBuilder extends DataBufferBuilder { } /** @ignore */ -export class OffsetsBufferBuilder extends DataBufferBuilder { +export class OffsetsBufferBuilder extends DataBufferBuilder { constructor(type: T) { - super(new type.OffsetType(1), 1); - this.toNumber = type.OffsetType === BigInt64Array ? BigInt : bigIntToNumber as any; + super(new type.OffsetArrayType(1), 1); + this.toNumber = type.OffsetArrayType === BigInt64Array ? BigInt : bigIntToNumber as any; } /** * The correct number constructor for the buffer type. */ - public toNumber: ((number: number | bigint) => T['TOffset'] extends BigInt64Array ? bigint : number); + public toNumber: ((number: number | bigint) => T['TOffsetArray'] extends BigInt64Array ? 
bigint : number); - public append(value: T['TOffset'][0]) { + public append(value: T['TOffsetArray'][0]) { return this.set(this.length - 1, value); } - public set(index: number, value: T['TOffset'][0]) { + public set(index: number, value: T['TOffsetArray'][0]) { const offset = this.length - 1; const buffer = this.reserve(index - offset + 1).buffer; if (offset < index++ && offset >= 0) { diff --git a/js/src/data.ts b/js/src/data.ts index 343e7f1fd3553..eba190a9860ba 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -34,7 +34,7 @@ import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; /** @ignore */ export interface Buffers { - [BufferType.OFFSET]: T['TOffset']; + [BufferType.OFFSET]: T['TOffsetArray']; [BufferType.DATA]: T['TArray']; [BufferType.VALIDITY]: Uint8Array; [BufferType.TYPE]: T['TArray']; diff --git a/js/src/type.ts b/js/src/type.ts index 27bfaabdf6035..6d9db4ebe1ae6 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -19,7 +19,7 @@ import { Field } from './schema.js'; import { Vector } from './vector.js'; import { MapRow } from './row/map.js'; import { StructRow, StructRowProxy } from './row/struct.js'; -import { BigIntArrayConstructor, TypedArrayConstructor } from './interfaces.js'; +import { ArrayCtor, BigIntArrayConstructor, TypedArrayConstructor } from './interfaces.js'; import { bigIntToNumber } from './util/bigint.js'; import { @@ -38,11 +38,11 @@ export type IsSigned = { 'true': true; 'false': false }; export interface DataType { readonly TType: TType; readonly TArray: any; - readonly TOffset: any; + readonly TOffsetArray: any; readonly TValue: any; readonly TChildren: TChildren; readonly ArrayType: any; - readonly OffsetType: TypedArrayConstructor | BigIntArrayConstructor; + readonly OffsetArrayType: ArrayCtor; readonly children: Field[]; } @@ -82,7 +82,7 @@ export abstract class DataType { (proto).children = null; (proto).ArrayType = Array; - (proto).OffsetType = Int32Array; + (proto).OffsetArrayType = Int32Array; return 
proto[Symbol.toStringTag] = 'DataType'; })(DataType.prototype); } @@ -250,7 +250,7 @@ export class Binary extends DataType { } /** @ignore */ -export interface Utf8 extends DataType { TArray: Uint8Array; TOffset: Int32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetType: TypedArrayConstructor } +export interface Utf8 extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } /** @ignore */ export class Utf8 extends DataType { constructor() { @@ -265,7 +265,7 @@ export class Utf8 extends DataType { } /** @ignore */ -export interface LargeUtf8 extends DataType { TArray: Uint8Array; TOffset: BigInt64Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetType: BigIntArrayConstructor } +export interface LargeUtf8 extends DataType { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } /** @ignore */ export class LargeUtf8 extends DataType { constructor() { @@ -275,7 +275,7 @@ export class LargeUtf8 extends DataType { public toString() { return `LargeUtf8`; } protected static [Symbol.toStringTag] = ((proto: LargeUtf8) => { (proto).ArrayType = Uint8Array; - (proto).OffsetType = BigInt64Array; + (proto).OffsetArrayType = BigInt64Array; return proto[Symbol.toStringTag] = 'LargeUtf8'; })(LargeUtf8.prototype); } @@ -567,7 +567,7 @@ export class FixedSizeBinary extends DataType { protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => { (proto).byteWidth = null; (proto).ArrayType = Uint8Array; - (proto).OffsetType = Int32Array; + (proto).OffsetArrayType = Int32Array; return proto[Symbol.toStringTag] = 'FixedSizeBinary'; })(FixedSizeBinary.prototype); } From 0357214ad3bbd8ee9fcfc43b8e8b926de26f83e0 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sat, 2 Dec 2023 10:44:21 -0500 Subject: [PATCH 18/37] More updates --- dev/archery/archery/integration/datagen.py 
| 3 +-- js/src/data.ts | 3 ++- js/src/ipc/metadata/json.ts | 3 ++- js/src/ipc/metadata/message.ts | 3 ++- js/src/visitor/bytelength.ts | 3 ++- js/src/visitor/vectorloader.ts | 3 +++ 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 80cc1c1e76425..30561f4a137c0 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1787,8 +1787,7 @@ def _temp_path(): generate_primitive_case([0, 0, 0], name='primitive_zerolength'), generate_primitive_large_offsets_case([17, 20]) - .skip_tester('C#') - .skip_tester('JS'), + .skip_tester('C#'), generate_null_case([10, 0]), diff --git a/js/src/data.ts b/js/src/data.ts index 86b9d08663ac9..e290e1eaf478d 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -30,6 +30,7 @@ import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; /** @ignore */ export type NullBuffer = Uint8Array | null | undefined; /** @ignore */ export type TypeIdsBuffer = Int8Array | ArrayLike | Iterable | undefined; /** @ignore */ export type ValueOffsetsBuffer = Int32Array | ArrayLike | Iterable | undefined; +/** @ignore */ export type LargeValueOffsetsBuffer = BigInt64Array | ArrayLike | Iterable | undefined; /** @ignore */ export type DataBuffer = T['TArray'] | ArrayLike | Iterable | undefined; /** @ignore */ @@ -444,7 +445,7 @@ interface DurationDataProps extends DataProps_ { data?: D interface FixedSizeBinaryDataProps extends DataProps_ { data?: DataBuffer } interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } -interface LargeUtf8DataProps extends DataProps_ { valueOffsets: BigInt64Array; data?: DataBuffer } +interface LargeUtf8DataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer; data?: DataBuffer } interface ListDataProps extends 
DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } interface FixedSizeListDataProps extends DataProps_ { child: Data } interface StructDataProps extends DataProps_ { children: Data[] } diff --git a/js/src/ipc/metadata/json.ts b/js/src/ipc/metadata/json.ts index f1f306730ddba..5e11e2e218299 100644 --- a/js/src/ipc/metadata/json.ts +++ b/js/src/ipc/metadata/json.ts @@ -20,7 +20,7 @@ import { Schema, Field } from '../../schema.js'; import { DataType, Dictionary, TimeBitWidth, - Utf8, Binary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -150,6 +150,7 @@ function typeFromJSON(f: any, children?: Field[]): DataType { case 'null': return new Null(); case 'binary': return new Binary(); case 'utf8': return new Utf8(); + case 'largeUtf8': return new LargeUtf8(); case 'bool': return new Bool(); case 'list': return new List((children || [])[0]); case 'struct': return new Struct(children || []); diff --git a/js/src/ipc/metadata/message.ts b/js/src/ipc/metadata/message.ts index 27c9b92d6897b..cf05bff54cfba 100644 --- a/js/src/ipc/metadata/message.ts +++ b/js/src/ipc/metadata/message.ts @@ -56,7 +56,7 @@ import ByteBuffer = flatbuffers.ByteBuffer; import { DataType, Dictionary, TimeBitWidth, - Utf8, Binary, Decimal, FixedSizeBinary, + Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -433,6 +433,7 @@ function decodeFieldType(f: _Field, children?: Field[]): DataType { case Type['Null']: return new Null(); case Type['Binary']: return new Binary(); case Type['Utf8']: return new Utf8(); + case Type['LargeUtf8']: return new LargeUtf8(); case Type['Bool']: return new Bool(); case Type['List']: return new List((children 
|| [])[0]); case Type['Struct_']: return new Struct(children || []); diff --git a/js/src/visitor/bytelength.ts b/js/src/visitor/bytelength.ts index 72d6148a52fd8..c3bfadd50e155 100644 --- a/js/src/visitor/bytelength.ts +++ b/js/src/visitor/bytelength.ts @@ -26,7 +26,7 @@ import { Type, TimeUnit, UnionMode } from '../enum.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Duration, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, + Bool, Null, Utf8, LargeUtf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, Union, DenseUnion, SparseUnion, } from '../type.js'; @@ -40,6 +40,7 @@ export interface GetByteLengthVisitor extends Visitor { getVisitFn(node: T): (data: Data>, index: number) => number; visitBinary(data: Data, index: number): number; visitUtf8(data: Data, index: number): number; + visitLargeUtf8(data: Data, index: number): number; visitList(data: Data, index: number): number; visitDenseUnion(data: Data, index: number): number; visitSparseUnion(data: Data, index: number): number; diff --git a/js/src/visitor/vectorloader.ts b/js/src/visitor/vectorloader.ts index db34edad9a1c1..e9800ec4409e4 100644 --- a/js/src/visitor/vectorloader.ts +++ b/js/src/visitor/vectorloader.ts @@ -71,6 +71,9 @@ export class VectorLoader extends Visitor { public visitUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitLargeUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { + return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); + } public visitBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: 
this.readOffsets(type), data: this.readData(type) }); } From ad9cfd5e207220b41aa38968f5131811eff81e6e Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sat, 2 Dec 2023 11:11:15 -0500 Subject: [PATCH 19/37] revert change and fix comment --- js/.vscode/settings.json | 2 +- js/src/enum.ts | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/js/.vscode/settings.json b/js/.vscode/settings.json index 0bef41b8ac9a8..113a662180c3c 100644 --- a/js/.vscode/settings.json +++ b/js/.vscode/settings.json @@ -17,6 +17,6 @@ "editor.defaultFormatter": "vscode.typescript-language-features" }, "jest.jestCommandLine": "node --experimental-vm-modules node_modules/jest/bin/jest.js --config jest.config.js", - "jest.autoRun": {"watch": false, "onSave": false}, + "jest.autoRun": {"watch": false, "onSave": "test-src-file"}, "typescript.preferences.importModuleSpecifierEnding": "js" } diff --git a/js/src/enum.ts b/js/src/enum.ts index 2ef3b8624773f..9c02192294793 100644 --- a/js/src/enum.ts +++ b/js/src/enum.ts @@ -137,8 +137,7 @@ export enum MessageHeader { * nested type consisting of other data types, or another data type (e.g. a * timestamp encoded as an int64). * - * **Note**: Only enum values 0-18 (NONE through Duration) are written to an Arrow - * IPC payload. + * **Note**: Only positive enum values are written to an Arrow IPC payload. * * The rest of the values are specified here so TypeScript can narrow the type * signatures further beyond the base Arrow Types. 
The Arrow DataTypes include From b3cd5b2c6afe27a39266c3d81d373fb9c6a18eeb Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sat, 2 Dec 2023 13:36:15 -0500 Subject: [PATCH 20/37] Get rid of toNumber --- js/src/builder/binary.ts | 4 ++-- js/src/builder/buffer.ts | 9 +-------- js/src/builder/largeutf8.ts | 20 ++++++++++++++++++-- js/src/type.ts | 10 +++++----- 4 files changed, 26 insertions(+), 17 deletions(-) diff --git a/js/src/builder/binary.ts b/js/src/builder/binary.ts index bcdefab9d3e66..3c12ddf34abb0 100644 --- a/js/src/builder/binary.ts +++ b/js/src/builder/binary.ts @@ -42,11 +42,11 @@ export class BinaryBuilder extends VariableWidthBuilder ((((Math.ceil(len) * BPE) + 63) & ~63) || 64) / BPE; @@ -129,14 +128,8 @@ export class BitmapBufferBuilder extends DataBufferBuilder { export class OffsetsBufferBuilder extends DataBufferBuilder { constructor(type: T) { super(new type.OffsetArrayType(1), 1); - this.toNumber = type.OffsetArrayType === BigInt64Array ? BigInt : bigIntToNumber as any; } - /** - * The correct number constructor for the buffer type. - */ - public toNumber: ((number: number | bigint) => T['TOffsetArray'] extends BigInt64Array ? bigint : number); - public append(value: T['TOffsetArray'][0]) { return this.set(this.length - 1, value); } @@ -151,7 +144,7 @@ export class OffsetsBufferBuilder extends DataBufferBuilder< } public flush(length = this.length - 1) { if (length > this.length) { - this.set(length - 1, this.toNumber(0)); + this.set(length - 1, this.BYTES_PER_ELEMENT > 4 ? 
0n : 0); } return super.flush(length + 1); } diff --git a/js/src/builder/largeutf8.ts b/js/src/builder/largeutf8.ts index 002bb8265a73d..0de97a4586e6e 100644 --- a/js/src/builder/largeutf8.ts +++ b/js/src/builder/largeutf8.ts @@ -38,7 +38,23 @@ export class LargeUtf8Builder extends VariableWidthBuilder, pendingLength: number): void { } + // TODO: move to largeBinaryBuilder when implemented + // protected _flushPending(pending: Map, pendingLength: number): void { } + protected _flushPending(pending: Map, pendingLength: number) { + const offsets = this._offsets; + const data = this._values.reserve(pendingLength).buffer; + let offset = 0; + for (const [index, value] of pending) { + if (value === undefined) { + offsets.set(index, 0n); + } else { + const length = value.length; + data.set(value, offset); + offsets.set(index, BigInt(length)); + offset += length; + } + } + } } -(LargeUtf8Builder.prototype as any)._flushPending = (BinaryBuilder.prototype as any)._flushPending; +// (LargeUtf8Builder.prototype as any)._flushPending = (LargeBinaryBuilder.prototype as any)._flushPending; diff --git a/js/src/type.ts b/js/src/type.ts index b66b277a19d0c..6223d0316f17a 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -236,7 +236,7 @@ Object.defineProperty(Float32.prototype, 'ArrayType', { value: Float32Array }); Object.defineProperty(Float64.prototype, 'ArrayType', { value: Float64Array }); /** @ignore */ -export interface Binary extends DataType { TArray: Uint8Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor } +export interface Binary extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } /** @ignore */ export class Binary extends DataType { constructor() { @@ -478,13 +478,13 @@ export class Duration extends DataType { } /** @ignore */ -export class DurationSecond extends Duration { constructor() { super(TimeUnit.SECOND); }} +export class DurationSecond extends 
Duration { constructor() { super(TimeUnit.SECOND); } } /** @ignore */ -export class DurationMillisecond extends Duration { constructor() { super(TimeUnit.MILLISECOND); }} +export class DurationMillisecond extends Duration { constructor() { super(TimeUnit.MILLISECOND); } } /** @ignore */ -export class DurationMicrosecond extends Duration { constructor() { super(TimeUnit.MICROSECOND); }} +export class DurationMicrosecond extends Duration { constructor() { super(TimeUnit.MICROSECOND); } } /** @ignore */ -export class DurationNanosecond extends Duration { constructor() { super(TimeUnit.NANOSECOND); }} +export class DurationNanosecond extends Duration { constructor() { super(TimeUnit.NANOSECOND); } } /** @ignore */ From 8b2b4430ffe6e3640b200f41aafadd22cde590f1 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sat, 2 Dec 2023 13:40:25 -0500 Subject: [PATCH 21/37] Remove unused get method --- js/src/builder.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/js/src/builder.ts b/js/src/builder.ts index ffb41f66b50f8..98d0e17911177 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -163,7 +163,6 @@ export abstract class Builder { public toVector() { return new Vector([this.flush()]); } public get ArrayType() { return this.type.ArrayType; } - public get OffsetArrayType() { return this.type.OffsetArrayType; } public get nullCount() { return this._nulls.numInvalid; } public get numChildren() { return this.children.length; } From aaa137ce0a90bd0a8287127c75a3af66d8ff5ee7 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sat, 2 Dec 2023 13:45:29 -0500 Subject: [PATCH 22/37] Skip js test for now --- dev/archery/archery/integration/datagen.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 30561f4a137c0..80cc1c1e76425 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1787,7 +1787,8 @@ def 
_temp_path(): generate_primitive_case([0, 0, 0], name='primitive_zerolength'), generate_primitive_large_offsets_case([17, 20]) - .skip_tester('C#'), + .skip_tester('C#') + .skip_tester('JS'), generate_null_case([10, 0]), From 4dbca13a8987dd0d6c6010c76be1c5a3107cd3d4 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sat, 2 Dec 2023 13:45:59 -0500 Subject: [PATCH 23/37] remove unused import --- js/src/builder/largeutf8.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/js/src/builder/largeutf8.ts b/js/src/builder/largeutf8.ts index 0de97a4586e6e..601e390aefcc4 100644 --- a/js/src/builder/largeutf8.ts +++ b/js/src/builder/largeutf8.ts @@ -17,7 +17,6 @@ import { LargeUtf8 } from '../type.js'; import { encodeUtf8 } from '../util/utf8.js'; -import { BinaryBuilder } from './binary.js'; import { BufferBuilder } from './buffer.js'; import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; From 22bad7fdabf697745c835b192957b1a012e49c7d Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sat, 2 Dec 2023 15:45:56 -0500 Subject: [PATCH 24/37] Support serialization to JSON and back for large utf8 --- js/src/builder.ts | 9 ++-- js/src/ipc/metadata/json.ts | 2 +- js/src/util/bigint.ts | 7 +++ js/src/util/buffer.ts | 6 ++- js/src/visitor/jsontypeassembler.ts | 2 +- js/src/visitor/jsonvectorassembler.ts | 4 +- js/src/visitor/vectorassembler.ts | 18 ++++++- js/src/visitor/vectorloader.ts | 4 +- js/test/data/tables.ts | 2 +- js/test/generate-test-data.ts | 46 ++++++++--------- js/test/unit/builders/largeUtf8-tests.ts | 65 ++++++++++++++++++++++++ js/test/unit/vector/vector-tests.ts | 24 ++++++++- 12 files changed, 148 insertions(+), 41 deletions(-) create mode 100644 js/test/unit/builders/largeUtf8-tests.ts diff --git a/js/src/builder.ts b/js/src/builder.ts index 98d0e17911177..8f1c4b685d7d8 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -284,13 +284,10 @@ export abstract class Builder { let valueOffsets; const { type, length, nullCount, _typeIds, _offsets, 
_values, _nulls } = this; - if (typeIds = _typeIds?.flush(length)) { // Unions - // DenseUnions + if (typeIds = _typeIds?.flush(length)) { // Unions, DenseUnions valueOffsets = _offsets?.flush(length); - } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8), and Lists - // Binary, Utf8, LargeUtf8 - const last = _offsets.last(); - data = _values?.flush(last != undefined ? Number(last) : last); + } else if (valueOffsets = _offsets?.flush(length)) { // Variable-width primitives (Binary, Utf8, LargeUtf8), and Lists + data = _values?.flush(_offsets.last()); } else { // Fixed-width primitives (Int, Float, Decimal, Time, Timestamp, Duration and Interval) data = _values?.flush(length); } diff --git a/js/src/ipc/metadata/json.ts b/js/src/ipc/metadata/json.ts index 5e11e2e218299..b669c0c612f8a 100644 --- a/js/src/ipc/metadata/json.ts +++ b/js/src/ipc/metadata/json.ts @@ -150,7 +150,7 @@ function typeFromJSON(f: any, children?: Field[]): DataType { case 'null': return new Null(); case 'binary': return new Binary(); case 'utf8': return new Utf8(); - case 'largeUtf8': return new LargeUtf8(); + case 'largeutf8': return new LargeUtf8(); case 'bool': return new Bool(); case 'list': return new List((children || [])[0]); case 'struct': return new Struct(children || []); diff --git a/js/src/util/bigint.ts b/js/src/util/bigint.ts index 5af2f7f052386..a0b79e5aad66c 100644 --- a/js/src/util/bigint.ts +++ b/js/src/util/bigint.ts @@ -24,3 +24,10 @@ export function bigIntToNumber(number: bigint | number): number { } return Number(number); } + +export function bigIntMin(a: T, b: T) { + if (a < b) { + return a; + } + return b; +} diff --git a/js/src/util/buffer.ts b/js/src/util/buffer.ts index dd8edf11f9258..5207278a62a64 100644 --- a/js/src/util/buffer.ts +++ b/js/src/util/buffer.ts @@ -208,10 +208,12 @@ export async function* toArrayBufferViewAsyncIterator(Arra /** @ignore */ export const toUint8ClampedArrayAsyncIterator = (input: 
ArrayBufferViewAsyncIteratorInput) => toArrayBufferViewAsyncIterator(Uint8ClampedArray, input); /** @ignore */ -export function rebaseValueOffsets(offset: number, length: number, valueOffsets: Int32Array) { +export function rebaseValueOffsets(offset: number, length: number, valueOffsets: Int32Array): Int32Array; +export function rebaseValueOffsets(offset: bigint, length: number, valueOffsets: BigInt64Array): BigInt64Array; +export function rebaseValueOffsets(offset: number | bigint, length: number, valueOffsets: any) { // If we have a non-zero offset, create a new offsets array with the values // shifted by the start offset, such that the new start offset is 0 - if (offset !== 0) { + if (offset != 0) { valueOffsets = valueOffsets.slice(0, length); for (let i = -1, n = valueOffsets.length; ++i < n;) { valueOffsets[i] += offset; diff --git a/js/src/visitor/jsontypeassembler.ts b/js/src/visitor/jsontypeassembler.ts index 885ab5e68efa5..a6746a858ecb4 100644 --- a/js/src/visitor/jsontypeassembler.ts +++ b/js/src/visitor/jsontypeassembler.ts @@ -67,7 +67,7 @@ export class JSONTypeAssembler extends Visitor { return { 'name': ArrowType[typeId].toLowerCase(), 'unit': IntervalUnit[unit] }; } public visitDuration({ typeId, unit }: T) { - return { 'name': ArrowType[typeId].toLocaleLowerCase(), 'unit': TimeUnit[unit]}; + return { 'name': ArrowType[typeId].toLocaleLowerCase(), 'unit': TimeUnit[unit] }; } public visitList({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; diff --git a/js/src/visitor/jsonvectorassembler.ts b/js/src/visitor/jsonvectorassembler.ts index a0e91a0c23dd9..9a3cb8601a434 100644 --- a/js/src/visitor/jsonvectorassembler.ts +++ b/js/src/visitor/jsonvectorassembler.ts @@ -42,7 +42,7 @@ export interface JSONVectorAssembler extends Visitor { visitInt(data: Data): { DATA: number[] | string[] }; visitFloat(data: Data): { DATA: number[] }; visitUtf8(data: Data): { DATA: string[]; OFFSET: number[] }; - visitLargeUtf8(data: Data): { DATA: string[]; 
OFFSET: bigint[] }; + visitLargeUtf8(data: Data): { DATA: string[]; OFFSET: string[] }; visitBinary(data: Data): { DATA: string[]; OFFSET: number[] }; visitFixedSizeBinary(data: Data): { DATA: string[] }; visitDate(data: Data): { DATA: number[] }; @@ -102,7 +102,7 @@ export class JSONVectorAssembler extends Visitor { return { 'DATA': [...new Vector([data])], 'OFFSET': [...data.valueOffsets] }; } public visitLargeUtf8(data: Data) { - return { 'DATA': [...new Vector([data])], 'OFFSET': [...data.valueOffsets] }; + return { 'DATA': [...new Vector([data])], 'OFFSET': [...bigNumsToStrings(data.valueOffsets, 2)] }; } public visitBinary(data: Data) { return { 'DATA': [...binaryToString(new Vector([data]))], OFFSET: [...data.valueOffsets] }; diff --git a/js/src/visitor/vectorassembler.ts b/js/src/visitor/vectorassembler.ts index 3e72e701ddaf6..faef717758e32 100644 --- a/js/src/visitor/vectorassembler.ts +++ b/js/src/visitor/vectorassembler.ts @@ -27,8 +27,9 @@ import { BufferRegion, FieldNode } from '../ipc/metadata/message.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, LargeUtf8, } from '../type.js'; +import { bigIntMin, bigIntToNumber } from '../util/bigint.js'; /** @ignore */ export interface VectorAssembler extends Visitor { @@ -211,6 +212,19 @@ function assembleFlatListVector(this: VectorAssembler, return this; } +/** @ignore */ +function assembleLargeFlatListVector(this: VectorAssembler, data: Data) { + const { length, values, valueOffsets } = data; + const { [0]: begin, [length]: end } = valueOffsets; + // TODO: we can probably merge this method with assembleFlatListVector if byteLength is a bigint when values array can be indexed by bigints + const byteLength = bigIntMin(end - begin, BigInt(values.byteLength) - begin); + // Push in the 
order FlatList types read their buffers + addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets)); // valueOffsets buffer first + // TODO: remove bigIntToNumber when the values array can be indexed by bigints + addBuffer.call(this, values.subarray(bigIntToNumber(begin), bigIntToNumber(begin + byteLength))); // sliced values buffer second + return this; +} + /** @ignore */ function assembleListVector(this: VectorAssembler, data: Data) { const { length, valueOffsets } = data; @@ -234,7 +248,7 @@ VectorAssembler.prototype.visitBool = assembleBoolVector; VectorAssembler.prototype.visitInt = assembleFlatVector; VectorAssembler.prototype.visitFloat = assembleFlatVector; VectorAssembler.prototype.visitUtf8 = assembleFlatListVector; -VectorAssembler.prototype.visitLargeUtf8 = assembleFlatListVector; +VectorAssembler.prototype.visitLargeUtf8 = assembleLargeFlatListVector; VectorAssembler.prototype.visitBinary = assembleFlatListVector; VectorAssembler.prototype.visitFixedSizeBinary = assembleFlatVector; VectorAssembler.prototype.visitDate = assembleFlatVector; diff --git a/js/src/visitor/vectorloader.ts b/js/src/visitor/vectorloader.ts index e9800ec4409e4..35f28f49baada 100644 --- a/js/src/visitor/vectorloader.ts +++ b/js/src/visitor/vectorloader.ts @@ -154,7 +154,7 @@ export class JSONVectorLoader extends VectorLoader { return nullCount <= 0 ? 
new Uint8Array(0) : packBools(this.sources[offset]); } protected readOffsets(_type: T, { offset } = this.nextBufferRange()) { - return toArrayBufferView(Uint8Array, toArrayBufferView(Int32Array, this.sources[offset])); + return toArrayBufferView(Uint8Array, toArrayBufferView(_type.OffsetArrayType, this.sources[offset])); } protected readTypeIds(type: T, { offset } = this.nextBufferRange()) { return toArrayBufferView(Uint8Array, toArrayBufferView(type.ArrayType, this.sources[offset])); @@ -173,7 +173,7 @@ export class JSONVectorLoader extends VectorLoader { return binaryDataFromJSON(sources[offset] as string[]); } else if (DataType.isBool(type)) { return packBools(sources[offset] as number[]); - } else if (DataType.isUtf8(type)) { + } else if (DataType.isUtf8(type) || DataType.isLargeUtf8(type)) { return encodeUtf8((sources[offset] as string[]).join('')); } return toArrayBufferView(Uint8Array, toArrayBufferView(type.ArrayType, sources[offset].map((x) => +x))); diff --git a/js/test/data/tables.ts b/js/test/data/tables.ts index 28aed7e4feccf..449cfe1fb853a 100644 --- a/js/test/data/tables.ts +++ b/js/test/data/tables.ts @@ -27,7 +27,7 @@ const nestedVectorGeneratorNames = ['struct', 'denseUnion', 'sparseUnion', 'map' const dictionaryKeyGeneratorNames = ['int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']; const valueVectorGeneratorNames = [ 'null_', 'bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', - 'float16', 'float32', 'float64', 'utf8', 'binary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', + 'float16', 'float32', 'float64', 'utf8', 'largeUtf8', 'binary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', 'timestampSecond', 'timestampMillisecond', 'timestampMicrosecond', 'timestampNanosecond', 'timeSecond', 'timeMillisecond', 'timeMicrosecond', 'timeNanosecond', 'decimal', 'dictionary', 'intervalDayTime', 'intervalYearMonth', diff --git a/js/test/generate-test-data.ts b/js/test/generate-test-data.ts index 
b7938dea0786f..9d7b038331fe6 100644 --- a/js/test/generate-test-data.ts +++ b/js/test/generate-test-data.ts @@ -246,7 +246,7 @@ export const fixedSizeList = (length = 100, nullCount = Math.trunc(length * 0.2) export const map = (length = 100, nullCount = Math.trunc(length * 0.2), child: Field> = defaultMapChild()) => vectorGenerator.visit(new Map_(child), length, nullCount); export const vecs = { - null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, binary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond + null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, largeUtf8, binary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond } as { [k: string]: (...args: any[]) => any }; function generateNull(this: TestDataVectorGenerator, type: T, length = 100): GeneratedVector { @@ -687,31 +687,31 @@ function createBitmap(length: number, nullCount: number) { } function createVariableWidthOffsets32(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { - const offsets = new Int32Array(length + 1); - iterateBitmap(length, nullBitmap, (i, valid) => { - if (!valid) { - offsets[i + 1] = offsets[i]; - } else { - do { - offsets[i + 1] = offsets[i] + Math.min(max, Math.max(min, 
Math.trunc(rand() * max))); - } while (!allowEmpty && offsets[i + 1] === offsets[i]); - } - }); - return offsets; + const offsets = new Int32Array(length + 1); + iterateBitmap(length, nullBitmap, (i, valid) => { + if (!valid) { + offsets[i + 1] = offsets[i]; + } else { + do { + offsets[i + 1] = offsets[i] + Math.min(max, Math.max(min, Math.trunc(rand() * max))); + } while (!allowEmpty && offsets[i + 1] === offsets[i]); + } + }); + return offsets; } function createVariableWidthOffsets64(length: number, nullBitmap: Uint8Array, min = 10, max = Number.POSITIVE_INFINITY, allowEmpty = true) { - const offsets = new BigInt64Array(length + 1); - iterateBitmap(length, nullBitmap, (i, valid) => { - if (!valid) { - offsets[i + 1] = offsets[i]; - } else { - do { - offsets[i + 1] = offsets[i] + BigInt(Math.min(max, Math.max(min, Math.trunc(rand() * max)))); - } while (!allowEmpty && offsets[i + 1] === offsets[i]); - } - }); - return offsets; + const offsets = new BigInt64Array(length + 1); + iterateBitmap(length, nullBitmap, (i, valid) => { + if (!valid) { + offsets[i + 1] = offsets[i]; + } else { + do { + offsets[i + 1] = offsets[i] + BigInt(Math.min(max, Math.max(min, Math.trunc(rand() * max)))); + } while (!allowEmpty && offsets[i + 1] === offsets[i]); + } + }); + return offsets; } function createVariableWidthBytes(length: number, nullBitmap: Uint8Array, offsets: Int32Array | BigInt64Array, getBytes: (index: number) => Uint8Array) { diff --git a/js/test/unit/builders/largeUtf8-tests.ts b/js/test/unit/builders/largeUtf8-tests.ts new file mode 100644 index 0000000000000..c789d5dbb1671 --- /dev/null +++ b/js/test/unit/builders/largeUtf8-tests.ts @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import 'web-streams-polyfill'; + +import { validateVector } from './utils.js'; +import { + encodeAll, + encodeEach, + encodeEachDOM, + encodeEachNode, + stringsNoNulls, + stringsWithNAs, + stringsWithNulls, + stringsWithEmpties +} from './utils.js'; + +import { Vector, LargeUtf8 } from 'apache-arrow'; + +const testDOMStreams = process.env.TEST_DOM_STREAMS === 'true'; +const testNodeStreams = process.env.TEST_NODE_STREAMS === 'true'; + +describe('LargeUtf8Builder', () => { + runTestsWithEncoder('encodeAll', encodeAll(() => new LargeUtf8())); + runTestsWithEncoder('encodeEach: 5', encodeEach(() => new LargeUtf8(), 5)); + runTestsWithEncoder('encodeEach: 25', encodeEach(() => new LargeUtf8(), 25)); + runTestsWithEncoder('encodeEach: undefined', encodeEach(() => new LargeUtf8(), void 0)); + testDOMStreams && runTestsWithEncoder('encodeEachDOM: 25', encodeEachDOM(() => new LargeUtf8(), 25)); + testNodeStreams && runTestsWithEncoder('encodeEachNode: 25', encodeEachNode(() => new LargeUtf8(), 25)); +}); + +function runTestsWithEncoder(name: string, encode: (vals: (string | null)[], nullVals?: any[]) => Promise>) { + describe(`${encode.name} ${name}`, () => { + it(`encodes strings no nulls`, async () => { + const vals = stringsNoNulls(20); + validateVector(vals, await encode(vals, []), []); + }); + it(`encodes strings with nulls`, async () => { + const vals = stringsWithNulls(20); + 
validateVector(vals, await encode(vals, [null]), [null]); + }); + it(`encodes strings using n/a as the null value rep`, async () => { + const vals = stringsWithNAs(20); + validateVector(vals, await encode(vals, ['n/a']), ['n/a']); + }); + it(`encodes strings using \\0 as the null value rep`, async () => { + const vals = stringsWithEmpties(20); + validateVector(vals, await encode(vals, ['\0']), ['\0']); + }); + }); +} diff --git a/js/test/unit/vector/vector-tests.ts b/js/test/unit/vector/vector-tests.ts index a259cbef87772..bfcf0d8547861 100644 --- a/js/test/unit/vector/vector-tests.ts +++ b/js/test/unit/vector/vector-tests.ts @@ -16,7 +16,7 @@ // under the License. import { - Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, util, Vector, vectorFromArray + Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Timestamp, TimeUnit, Utf8, LargeUtf8, util, Vector, vectorFromArray } from 'apache-arrow'; describe(`makeVectorFromArray`, () => { @@ -196,6 +196,28 @@ describe(`Utf8Vector`, () => { }); }); +describe(`LargeUtf8Vector`, () => { + const values = ['foo', 'bar', 'baz', 'foo bar', 'bar']; + const vector = vectorFromArray(values, new LargeUtf8); + + test(`has largeUtf8 type`, () => { + expect(vector.type).toBeInstanceOf(LargeUtf8); + }); + + test(`is not memoized`, () => { + expect(vector.isMemoized).toBe(false); + const memoizedVector = vector.memoize(); + expect(memoizedVector.isMemoized).toBe(true); + const unMemoizedVector = vector.unmemoize(); + expect(unMemoizedVector.isMemoized).toBe(false); + }); + + basicVectorTests(vector, values, ['abc', '123']); + describe(`sliced`, () => { + basicVectorTests(vector.slice(1, 3), values.slice(1, 3), ['foo', 'abc']); + }); +}); + describe(`ListVector`, () => { const values = [[1, 2], [1, 2, 3]]; const vector = vectorFromArray(values); From 28795dcd8a0811a96a752bd8fbfac4abffafc20b Mon Sep 17 00:00:00 2001 From: Dominik Moritz 
Date: Sat, 2 Dec 2023 16:38:47 -0500 Subject: [PATCH 25/37] Pre es2020 compat --- js/src/builder/buffer.ts | 2 +- js/src/builder/largeutf8.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index bd0c3b3c53fa2..53d2793c9b422 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -144,7 +144,7 @@ export class OffsetsBufferBuilder extends DataBufferBuilder< } public flush(length = this.length - 1) { if (length > this.length) { - this.set(length - 1, this.BYTES_PER_ELEMENT > 4 ? 0n : 0); + this.set(length - 1, this.BYTES_PER_ELEMENT > 4 ? BigInt(0) : 0); } return super.flush(length + 1); } diff --git a/js/src/builder/largeutf8.ts b/js/src/builder/largeutf8.ts index 601e390aefcc4..fddfeaf8e7b17 100644 --- a/js/src/builder/largeutf8.ts +++ b/js/src/builder/largeutf8.ts @@ -45,7 +45,7 @@ export class LargeUtf8Builder extends VariableWidthBuilder Date: Sat, 2 Dec 2023 20:12:08 -0500 Subject: [PATCH 26/37] Remove offset from buffer --- js/src/builder/buffer.ts | 7 ------- 1 file changed, 7 deletions(-) diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index 53d2793c9b422..aaedb43146069 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -26,11 +26,6 @@ const sliceOrExtendArray = (arr: T, len = 0) arr.length >= len ? 
arr.subarray(0, len) : memcpy(new (arr.constructor as any)(len), arr, 0) ) as T; -/** @ignore */ -export interface BufferBuilder { - readonly offset: number; -} - /** @ignore */ export class BufferBuilder { @@ -88,8 +83,6 @@ export class BufferBuilder { } } -(BufferBuilder.prototype as any).offset = 0; - /** @ignore */ export class DataBufferBuilder extends BufferBuilder { public last() { return this.get(this.length - 1); } From a79f5cae6b12b17d31e183ffb31c00ebcf446663 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sun, 3 Dec 2023 21:48:46 -0500 Subject: [PATCH 27/37] Update enum.ts Co-authored-by: Kyle Barron --- js/src/enum.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/src/enum.ts b/js/src/enum.ts index 9c02192294793..764ea64e63338 100644 --- a/js/src/enum.ts +++ b/js/src/enum.ts @@ -137,7 +137,7 @@ export enum MessageHeader { * nested type consisting of other data types, or another data type (e.g. a * timestamp encoded as an int64). * - * **Note**: Only positive enum values are written to an Arrow IPC payload. + * **Note**: Only non-negative enum values are written to an Arrow IPC payload. * * The rest of the values are specified here so TypeScript can narrow the type * signatures further beyond the base Arrow Types. The Arrow DataTypes include From f1042a52ab484cc33362fab968b04a4386af73cc Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Mon, 4 Dec 2023 16:14:48 -0500 Subject: [PATCH 28/37] I think this is the right buffer builder. 
--- js/src/builder/binary.ts | 4 ++-- js/src/builder/buffer.ts | 5 ++--- js/src/builder/largeutf8.ts | 4 ++-- js/src/builder/utf8.ts | 4 ++-- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/js/src/builder/binary.ts b/js/src/builder/binary.ts index 3c12ddf34abb0..e60abb6c7b42a 100644 --- a/js/src/builder/binary.ts +++ b/js/src/builder/binary.ts @@ -17,14 +17,14 @@ import { Binary } from '../type.js'; import { toUint8Array } from '../util/buffer.js'; -import { BufferBuilder } from './buffer.js'; +import { DataBufferBuilder } from './buffer.js'; import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; /** @ignore */ export class BinaryBuilder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new DataBufferBuilder(new Uint8Array(0)); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index aaedb43146069..435b0658c39eb 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -27,7 +27,7 @@ const sliceOrExtendArray = (arr: T, len = 0) ) as T; /** @ignore */ -export class BufferBuilder { +export abstract class BufferBuilder { constructor(buffer: T, stride = 1) { this.buffer = buffer; @@ -49,8 +49,7 @@ export class BufferBuilder { public get reservedLength() { return this.buffer.length / this.stride; } public get reservedByteLength() { return this.buffer.byteLength; } - // @ts-ignore - public set(index: number, value: T[0]) { return this; } + public abstract set(index: number, value: T[0]): this; public append(value: T[0]) { return this.set(this.length, value); } public reserve(extra: number) { if (extra > 0) { diff --git a/js/src/builder/largeutf8.ts b/js/src/builder/largeutf8.ts index fddfeaf8e7b17..677702bfcd3b4 100644 --- a/js/src/builder/largeutf8.ts +++ b/js/src/builder/largeutf8.ts @@ -17,14 +17,14 @@ import { LargeUtf8 } 
from '../type.js'; import { encodeUtf8 } from '../util/utf8.js'; -import { BufferBuilder } from './buffer.js'; +import { DataBufferBuilder } from './buffer.js'; import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; /** @ignore */ export class LargeUtf8Builder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new DataBufferBuilder(new Uint8Array(0)); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); diff --git a/js/src/builder/utf8.ts b/js/src/builder/utf8.ts index 53b8306cbaffd..34864a2fcb032 100644 --- a/js/src/builder/utf8.ts +++ b/js/src/builder/utf8.ts @@ -18,14 +18,14 @@ import { Utf8 } from '../type.js'; import { encodeUtf8 } from '../util/utf8.js'; import { BinaryBuilder } from './binary.js'; -import { BufferBuilder } from './buffer.js'; +import { DataBufferBuilder } from './buffer.js'; import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; /** @ignore */ export class Utf8Builder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new DataBufferBuilder(new Uint8Array(0)); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); From e66d5aeb69a90df75bb2b3263d87e76794e98281 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Mon, 4 Dec 2023 20:29:25 -0500 Subject: [PATCH 29/37] Add types to builder flush method --- js/src/builder.ts | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/js/src/builder.ts b/js/src/builder.ts index 8f1c4b685d7d8..1a4c52f871bbf 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -277,11 +277,10 @@ export abstract class Builder { * @returns A `Data` of the buffers and children representing the values written. 
*/ public flush(): Data { - - let data; - let typeIds; - let nullBitmap; - let valueOffsets; + let data: BufferBuilder | undefined; + let typeIds: Int8Array; + let nullBitmap: Uint8Array | undefined; + let valueOffsets: T['TOffsetArray']; const { type, length, nullCount, _typeIds, _offsets, _values, _nulls } = this; if (typeIds = _typeIds?.flush(length)) { // Unions, DenseUnions From b05f38d05fa58713878f79e62b308186e02de6a9 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Tue, 5 Dec 2023 16:49:37 -0500 Subject: [PATCH 30/37] Correct offset buffer type --- js/src/data.ts | 4 ++-- js/src/util/buffer.ts | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/js/src/data.ts b/js/src/data.ts index e290e1eaf478d..145ee9d049cb4 100644 --- a/js/src/data.ts +++ b/js/src/data.ts @@ -30,7 +30,7 @@ import { popcnt_bit_range, truncateBitmap } from './util/bit.js'; /** @ignore */ export type NullBuffer = Uint8Array | null | undefined; /** @ignore */ export type TypeIdsBuffer = Int8Array | ArrayLike | Iterable | undefined; /** @ignore */ export type ValueOffsetsBuffer = Int32Array | ArrayLike | Iterable | undefined; -/** @ignore */ export type LargeValueOffsetsBuffer = BigInt64Array | ArrayLike | Iterable | undefined; +/** @ignore */ export type LargeValueOffsetsBuffer = BigInt64Array | ArrayLike | Iterable | undefined; /** @ignore */ export type DataBuffer = T['TArray'] | ArrayLike | Iterable | undefined; /** @ignore */ @@ -445,7 +445,7 @@ interface DurationDataProps extends DataProps_ { data?: D interface FixedSizeBinaryDataProps extends DataProps_ { data?: DataBuffer } interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } -interface LargeUtf8DataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer; data?: DataBuffer } +interface LargeUtf8DataProps extends DataProps_ { valueOffsets: 
LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface ListDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } interface FixedSizeListDataProps extends DataProps_ { child: Data } interface StructDataProps extends DataProps_ { children: Data[] } diff --git a/js/src/util/buffer.ts b/js/src/util/buffer.ts index 5207278a62a64..e88620ebfd7cf 100644 --- a/js/src/util/buffer.ts +++ b/js/src/util/buffer.ts @@ -83,9 +83,9 @@ export function joinUint8Arrays(chunks: Uint8Array[], size?: number | null): [Ui } /** @ignore */ -export type ArrayBufferViewInput = ArrayBufferView | ArrayBufferLike | ArrayBufferView | Iterable | ArrayLike | ByteBuffer | string | null | undefined | - IteratorResult | ArrayLike | ByteBuffer | string | null | undefined> | - ReadableStreamReadResult | ArrayLike | ByteBuffer | string | null | undefined>; +export type ArrayBufferViewInput = ArrayBufferView | ArrayBufferLike | ArrayBufferView | Iterable | Iterable | ArrayLike | ArrayLike | ByteBuffer | string | null | undefined | + IteratorResult | Iterable | ArrayLike | ArrayLike | ByteBuffer | string | null | undefined> | + ReadableStreamReadResult | Iterable | ArrayLike | ArrayLike | ByteBuffer | string | null | undefined>; /** @ignore */ export function toArrayBufferView< From dae4775643fbecdfca3840121aaa855a932ac14f Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Tue, 5 Dec 2023 17:05:07 -0500 Subject: [PATCH 31/37] Revert "I think this is the right buffer builder." This reverts commit f1042a52ab484cc33362fab968b04a4386af73cc. 
--- js/src/builder/binary.ts | 4 ++-- js/src/builder/buffer.ts | 5 +++-- js/src/builder/largeutf8.ts | 4 ++-- js/src/builder/utf8.ts | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/js/src/builder/binary.ts b/js/src/builder/binary.ts index e60abb6c7b42a..3c12ddf34abb0 100644 --- a/js/src/builder/binary.ts +++ b/js/src/builder/binary.ts @@ -17,14 +17,14 @@ import { Binary } from '../type.js'; import { toUint8Array } from '../util/buffer.js'; -import { DataBufferBuilder } from './buffer.js'; +import { BufferBuilder } from './buffer.js'; import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; /** @ignore */ export class BinaryBuilder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new DataBufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(new Uint8Array(0)); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index 435b0658c39eb..aaedb43146069 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -27,7 +27,7 @@ const sliceOrExtendArray = (arr: T, len = 0) ) as T; /** @ignore */ -export abstract class BufferBuilder { +export class BufferBuilder { constructor(buffer: T, stride = 1) { this.buffer = buffer; @@ -49,7 +49,8 @@ export abstract class BufferBuilder { public get reservedLength() { return this.buffer.length / this.stride; } public get reservedByteLength() { return this.buffer.byteLength; } - public abstract set(index: number, value: T[0]): this; + // @ts-ignore + public set(index: number, value: T[0]) { return this; } public append(value: T[0]) { return this.set(this.length, value); } public reserve(extra: number) { if (extra > 0) { diff --git a/js/src/builder/largeutf8.ts b/js/src/builder/largeutf8.ts index 677702bfcd3b4..fddfeaf8e7b17 100644 --- a/js/src/builder/largeutf8.ts +++ b/js/src/builder/largeutf8.ts @@ -17,14 +17,14 @@ import { 
LargeUtf8 } from '../type.js'; import { encodeUtf8 } from '../util/utf8.js'; -import { DataBufferBuilder } from './buffer.js'; +import { BufferBuilder } from './buffer.js'; import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; /** @ignore */ export class LargeUtf8Builder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new DataBufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(new Uint8Array(0)); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); diff --git a/js/src/builder/utf8.ts b/js/src/builder/utf8.ts index 34864a2fcb032..53b8306cbaffd 100644 --- a/js/src/builder/utf8.ts +++ b/js/src/builder/utf8.ts @@ -18,14 +18,14 @@ import { Utf8 } from '../type.js'; import { encodeUtf8 } from '../util/utf8.js'; import { BinaryBuilder } from './binary.js'; -import { DataBufferBuilder } from './buffer.js'; +import { BufferBuilder } from './buffer.js'; import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; /** @ignore */ export class Utf8Builder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new DataBufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(new Uint8Array(0)); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); From 6949b6c78bb2e728e9fd77507a5aa5f8a9b8714f Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Tue, 5 Dec 2023 23:45:42 -0500 Subject: [PATCH 32/37] Better rounding method This method is numerically more stable. 
```ts const round64_1 = (num: number) => Math.ceil(num / 64) * 64 const round64_2 = (num: number) => ((num) + 63) & ~63 const round64_3 = (num: number) => num + 63 - (num + 63) % 64; const round64_4 = (num: number) => num - 1 - (num - 1) % 64 + 64; const round64_5 = (num: number) => ((num + 63) >> 6) << 6; { for (const f of [round64_1, round64_2, round64_3, round64_4, round64_5]) { console.log(f) console.log(f(-1)) console.log(f(0)) console.log(f(1)) console.log(f(2)) console.log(f(63)) console.log(f(64)) console.log(f(65)) console.log(f(2 ** 16)) console.log(f(2 ** 32)) console.log(f(2 ** 42)) console.log(f(Number.MAX_SAFE_INTEGER / 100)) console.log(f(Number.MAX_SAFE_INTEGER)) console.log(f(Number.MAX_SAFE_INTEGER * 10)) } } ``` --- js/src/builder/buffer.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index aaedb43146069..e49e73d4089c8 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -20,7 +20,10 @@ import { TypedArray, BigIntArray, ArrayCtor } from '../interfaces.js'; import { DataType } from '../type.js'; /** @ignore */ -const roundLengthUpToNearest64Bytes = (len: number, BPE: number) => ((((Math.ceil(len) * BPE) + 63) & ~63) || 64) / BPE; +function roundLengthUpToNearest64Bytes(len: number, BPE: number) { + const bytesMinus1 = len * BPE - 1; + return ((bytesMinus1 - bytesMinus1 % 64 + 64) || 64) / BPE; +} /** @ignore */ const sliceOrExtendArray = (arr: T, len = 0) => ( arr.length >= len ? 
arr.subarray(0, len) : memcpy(new (arr.constructor as any)(len), arr, 0) From d2f7b0bbd9108e199e13cb3d50e39420536aaaea Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Wed, 6 Dec 2023 00:17:24 -0500 Subject: [PATCH 33/37] bring back ceil --- js/src/builder/buffer.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index e49e73d4089c8..402172059682c 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -21,7 +21,7 @@ import { DataType } from '../type.js'; /** @ignore */ function roundLengthUpToNearest64Bytes(len: number, BPE: number) { - const bytesMinus1 = len * BPE - 1; + const bytesMinus1 = Math.ceil(len) * BPE - 1; return ((bytesMinus1 - bytesMinus1 % 64 + 64) || 64) / BPE; } /** @ignore */ From d63ef0c8d29141ea4646c42a33a5f73a207cf346 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Fri, 8 Dec 2023 10:38:39 -0500 Subject: [PATCH 34/37] style: format --- js/src/interfaces.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/src/interfaces.ts b/js/src/interfaces.ts index 11138a0cb656d..707d01bb14cca 100644 --- a/js/src/interfaces.ts +++ b/js/src/interfaces.ts @@ -339,11 +339,11 @@ type DataTypeToBuilder = { [Type.Interval]: T extends type.Interval ? IntervalBuilder : never; [Type.IntervalDayTime]: T extends type.IntervalDayTime ? IntervalDayTimeBuilder : never; [Type.IntervalYearMonth]: T extends type.IntervalYearMonth ? IntervalYearMonthBuilder : never; - [Type.Duration]: T extends type.Duration ? DurationBuilder: never; + [Type.Duration]: T extends type.Duration ? DurationBuilder : never; [Type.DurationSecond]: T extends type.DurationSecond ? DurationSecondBuilder : never; [Type.DurationMillisecond]: T extends type.DurationMillisecond ? DurationMillisecondBuilder : never; - [Type.DurationMicrosecond]: T extends type.DurationMicrosecond ? DurationMicrosecondBuilder: never; - [Type.DurationNanosecond]: T extends type.DurationNanosecond ? 
DurationNanosecondBuilder: never; + [Type.DurationMicrosecond]: T extends type.DurationMicrosecond ? DurationMicrosecondBuilder : never; + [Type.DurationNanosecond]: T extends type.DurationNanosecond ? DurationNanosecondBuilder : never; [Type.Map]: T extends type.Map_ ? MapBuilder : never; [Type.List]: T extends type.List ? ListBuilder : never; [Type.Struct]: T extends type.Struct ? StructBuilder : never; From 806800be6588204cf8fcd5876a1e4e1022d5e7db Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Fri, 8 Dec 2023 10:50:50 -0500 Subject: [PATCH 35/37] update status --- docs/source/status.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/status.rst b/docs/source/status.rst index 6167d3037ba77..09ed6bcfcf222 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -66,7 +66,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Utf8 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Large Utf8 | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +| Large Utf8 | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Binary View | ✓ | | ✓ | | | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ From 1da7ef6ac56f820905670910e6b1be5f4160fdf6 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sat, 16 Dec 2023 10:32:53 -0500 Subject: [PATCH 36/37] Simplify vector assembler --- js/src/util/buffer.ts | 6 +++--- js/src/visitor/vectorassembler.ts | 31 +++++++++++++++++++------------ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/js/src/util/buffer.ts b/js/src/util/buffer.ts index e88620ebfd7cf..4f4379dedf6d8 100644 --- a/js/src/util/buffer.ts +++ b/js/src/util/buffer.ts @@ -209,11 +209,11 @@ export async function* toArrayBufferViewAsyncIterator(Arra /** @ignore */ export function 
rebaseValueOffsets(offset: number, length: number, valueOffsets: Int32Array): Int32Array; -export function rebaseValueOffsets(offset: bigint, length: number, valueOffsets: BigInt64Array): BigInt64Array; -export function rebaseValueOffsets(offset: number | bigint, length: number, valueOffsets: any) { +export function rebaseValueOffsets(offset: number, length: number, valueOffsets: BigInt64Array): BigInt64Array; +export function rebaseValueOffsets(offset: number, length: number, valueOffsets: any) { // If we have a non-zero offset, create a new offsets array with the values // shifted by the start offset, such that the new start offset is 0 - if (offset != 0) { + if (offset !== 0) { valueOffsets = valueOffsets.slice(0, length); for (let i = -1, n = valueOffsets.length; ++i < n;) { valueOffsets[i] += offset; diff --git a/js/src/visitor/vectorassembler.ts b/js/src/visitor/vectorassembler.ts index faef717758e32..302401303ca6c 100644 --- a/js/src/visitor/vectorassembler.ts +++ b/js/src/visitor/vectorassembler.ts @@ -205,23 +205,30 @@ function assembleFlatVector(this: VectorAssembler, data: Data) { const { length, values, valueOffsets } = data; const { [0]: begin, [length]: end } = valueOffsets; - const byteLength = Math.min(end - begin, values.byteLength - begin); - // Push in the order FlatList types read their buffers - addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets)); // valueOffsets buffer first - addBuffer.call(this, values.subarray(begin, begin + byteLength)); // sliced values buffer second - return this; + return _assembleFlatListVector.call(this, length, begin, end, values, valueOffsets); } /** @ignore */ -function assembleLargeFlatListVector(this: VectorAssembler, data: Data) { +function assembleLargeFlatListVector(this: VectorAssembler, data: Data) { const { length, values, valueOffsets } = data; - const { [0]: begin, [length]: end } = valueOffsets; - // TODO: we can probably merge this method with assembleFlatListVector if byteLength 
is a bigint when values array can be indexed by bigints - const byteLength = bigIntMin(end - begin, BigInt(values.byteLength) - begin); + const begin = bigIntToNumber(valueOffsets[0]); + const end = bigIntToNumber(valueOffsets[length]); + return _assembleFlatListVector.call(this, length, begin, end, values, valueOffsets); +} + +/** @ignore */ +function _assembleFlatListVector( + this: VectorAssembler, + length: number, + begin: number, + end: number, + values: T['TArray'], + valueOffsets: T['TOffsetArray'] +) { + const byteLength = Math.min(end - begin, values.byteLength - begin); // Push in the order FlatList types read their buffers - addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets)); // valueOffsets buffer first - // TODO: remove bigIntToNumber when the values array can be indexed by bigints - addBuffer.call(this, values.subarray(bigIntToNumber(begin), bigIntToNumber(begin + byteLength))); // sliced values buffer second + addBuffer.call(this, rebaseValueOffsets(-begin, length + 1, valueOffsets as any)); // valueOffsets buffer first + addBuffer.call(this, values.subarray(begin, begin + byteLength)); // sliced values buffer second return this; } From 611791620d0127a6a4166ea743fc7ef063260446 Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Sat, 16 Dec 2023 10:45:50 -0500 Subject: [PATCH 37/37] Separate code paths for utf and large utf --- js/src/util/bigint.ts | 7 ------- js/src/visitor/get.ts | 16 +++++++++++++--- js/src/visitor/set.ts | 26 ++++++++++++++++++++------ js/src/visitor/vectorassembler.ts | 2 +- 4 files changed, 34 insertions(+), 17 deletions(-) diff --git a/js/src/util/bigint.ts b/js/src/util/bigint.ts index a0b79e5aad66c..5af2f7f052386 100644 --- a/js/src/util/bigint.ts +++ b/js/src/util/bigint.ts @@ -24,10 +24,3 @@ export function bigIntToNumber(number: bigint | number): number { } return Number(number); } - -export function bigIntMin(a: T, b: T) { - if (a < b) { - return a; - } - return b; -} diff --git 
a/js/src/visitor/get.ts b/js/src/visitor/get.ts index 42df8d031e7a1..a801c90047c89 100644 --- a/js/src/visitor/get.ts +++ b/js/src/visitor/get.ts @@ -21,6 +21,7 @@ import { Vector } from '../vector.js'; import { Visitor } from '../visitor.js'; import { MapRow } from '../row/map.js'; import { StructRow, StructRowProxy } from '../row/struct.js'; +import { bigIntToNumber } from '../util/bigint.js'; import { decodeUtf8 } from '../util/utf8.js'; import { TypeToDataType } from '../interfaces.js'; import { uint16ToFloat64 } from '../util/math.js'; @@ -115,13 +116,22 @@ function wrapGet(fn: (data: Data, _1: any) => any) { /** @ignore */ const getNull = (_data: Data, _index: number): T['TValue'] => null; /** @ignore */ -const getVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array | BigInt64Array, index: number) => { +const getVariableWidthBytes = (values: Uint8Array, valueOffsets: Int32Array, index: number) => { if (index + 1 >= valueOffsets.length) { return null as any; } const x = valueOffsets[index]; const y = valueOffsets[index + 1]; - return values.subarray(Number(x), Number(y)); + return values.subarray(x, y); +}; +/** @ignore */ +const getLargeVariableWidthBytes = (values: Uint8Array, valueOffsets: BigInt64Array, index: number) => { + if (index + 1 >= valueOffsets.length) { + return null as any; + } + const x = bigIntToNumber(valueOffsets[index]); + const y = bigIntToNumber(valueOffsets[index + 1]); + return values.subarray(x, y); }; /** @ignore */ @@ -158,7 +168,7 @@ const getUtf8 = ({ values, valueOffsets }: Data, index: numbe }; /** @ignore */ const getLargeUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { - const bytes = getVariableWidthBytes(values, valueOffsets, index); + const bytes = getLargeVariableWidthBytes(values, valueOffsets, index); return bytes !== null ? 
decodeUtf8(bytes) : null as any; }; diff --git a/js/src/visitor/set.ts b/js/src/visitor/set.ts index 15db17ad37594..a439ec8311fd6 100644 --- a/js/src/visitor/set.ts +++ b/js/src/visitor/set.ts @@ -19,6 +19,7 @@ import { Data } from '../data.js'; import { Field } from '../schema.js'; import { Vector } from '../vector.js'; import { Visitor } from '../visitor.js'; +import { bigIntToNumber } from '../util/bigint.js'; import { encodeUtf8 } from '../util/utf8.js'; import { TypeToDataType } from '../interfaces.js'; import { float64ToUint16 } from '../util/math.js'; @@ -124,11 +125,20 @@ export const setEpochMsToNanosecondsLong = (data: Int32Array, index: number, epo }; /** @ignore */ -export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: T, index: number, value: Uint8Array) => { +export const setVariableWidthBytes = (values: Uint8Array, valueOffsets: T, index: number, value: Uint8Array) => { if (index + 1 < valueOffsets.length) { - const x = valueOffsets[index] as T extends Int32Array ? number : bigint; - const y = valueOffsets[index + 1] as T extends Int32Array ? 
number : bigint; - values.set(value.subarray(0, Number(y - x)), Number(x)); + const x = valueOffsets[index]; + const y = valueOffsets[index + 1]; + values.set(value.subarray(0, y - x), x); + } +}; + +/** @ignore */ +export const setLargeVariableWidthBytes = (values: Uint8Array, valueOffsets: T, index: number, value: Uint8Array) => { + if (index + 1 < valueOffsets.length) { + const x = bigIntToNumber(valueOffsets[index]); + const y = bigIntToNumber(valueOffsets[index + 1]); + values.set(value.subarray(0, y - x), x); } }; @@ -166,9 +176,13 @@ export const setFixedSizeBinary = ({ stride, values } /** @ignore */ const setBinary = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, value); /** @ignore */ -const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { +const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); }; +/** @ignore */ +const setLargeUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => { + setLargeVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); +}; /* istanbul ignore next */ export const setDate = (data: Data, index: number, value: T['TValue']): void => { @@ -367,7 +381,7 @@ SetVisitor.prototype.visitFloat16 = wrapSet(setFloat16); SetVisitor.prototype.visitFloat32 = wrapSet(setFloat); SetVisitor.prototype.visitFloat64 = wrapSet(setFloat); SetVisitor.prototype.visitUtf8 = wrapSet(setUtf8); -SetVisitor.prototype.visitLargeUtf8 = wrapSet(setUtf8); +SetVisitor.prototype.visitLargeUtf8 = wrapSet(setLargeUtf8); SetVisitor.prototype.visitBinary = wrapSet(setBinary); SetVisitor.prototype.visitFixedSizeBinary = wrapSet(setFixedSizeBinary); SetVisitor.prototype.visitDate = wrapSet(setDate); diff --git a/js/src/visitor/vectorassembler.ts b/js/src/visitor/vectorassembler.ts index 302401303ca6c..7a9d3bdd57b0d 100644 --- 
a/js/src/visitor/vectorassembler.ts +++ b/js/src/visitor/vectorassembler.ts @@ -29,7 +29,7 @@ import { Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, Bool, Null, Utf8, Binary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, LargeUtf8, } from '../type.js'; -import { bigIntMin, bigIntToNumber } from '../util/bigint.js'; +import { bigIntToNumber } from '../util/bigint.js'; /** @ignore */ export interface VectorAssembler extends Visitor {