forked from lancedb/lance
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfile.proto
189 lines (166 loc) · 5.58 KB
/
file.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.file;
// Describes the contents of a Lance file: its schema and its row count.
message FileDescriptor {
  // Schema of the data stored in the file.
  Schema schema = 1;

  // Total number of rows in the file.
  uint64 length = 2;
}
// Describes the data type of every column in a file.
message Schema {
  // Every field in the file, nested fields included.
  //
  // NOTE(review): nesting is presumably expressed through `Field.parent_id`
  // rather than by embedding child fields -- confirm.
  repeated lance.file.Field fields = 1;

  // Schema-level metadata: string keys mapped to raw byte values.
  map<string, bytes> metadata = 5;
}
// Metadata of a single Lance file.
message Metadata {
  // Field number 4 used to carry StatisticsMetadata. It was moved (see
  // `statistics`, number 5) to prevent a bug in older readers, so 4 stays
  // reserved and must not be reused.
  reserved 4;

  // Position of the manifest within the file. A value of zero means the
  // manifest is stored externally.
  uint64 manifest_position = 1;

  // Logical offsets of each chunk group (batch), counted in rows.
  //
  // NOTE(review): confirm whether these values are cumulative row offsets or
  // per-chunk row counts; the field name suggests the former.
  repeated int32 batch_offsets = 2;

  // File position at which the page table is stored.
  //
  // The page table is an N x M x 2 matrix, where N = num_fields and
  // M = num_batches. Each cell holds the <position, length> pair of one page;
  // both values are int64. The <position, length> pairs of all pages that
  // belong to the same column are stored contiguously.
  //
  // Every field that is part of the file has a run in the page table. That
  // includes struct columns, whose run has length 0 because they do not store
  // any actual data.
  //
  // For example, for column 5 and batch 4:
  // ```text
  //   position = page_table[5][4][0];
  //   length = page_table[5][4][1];
  // ```
  uint64 page_table_position = 3;

  // Describes the statistics stored in the file and where to find them.
  message StatisticsMetadata {
    // Schema of the statistics.
    //
    // May be empty, meaning the file has no statistics. It also may not
    // contain statistics for every field.
    repeated Field schema = 1;

    // Field ids of the statistics leaf fields.
    //
    // Plays a role similar to the `fields` field of the DataFile message.
    // Each id corresponds to a field in `schema`, and there is one id per
    // column in the statistics page table.
    repeated int32 fields = 2;

    // File position of the statistics page table.
    //
    // The statistics page table is an N x 2 matrix, where N is the length of
    // `fields`. The layout matches the main page table, except that there is
    // always exactly one batch.
    //
    // For example, to locate statistics column 5:
    // ```text
    //   position = stats_page_table[5][0];
    //   length = stats_page_table[5][1];
    // ```
    uint64 page_table_position = 3;
  }

  // Statistics metadata for this file (replaces the reserved field 4).
  StatisticsMetadata statistics = 5;
}  // Metadata
// The encodings supported for field data.
enum Encoding {
  // Invalid encoding. As the zero value, this is also the proto3 default.
  NONE = 0;

  // Plain encoding.
  PLAIN = 1;

  // Variable-length binary encoding.
  VAR_BINARY = 2;

  // Dictionary encoding.
  DICTIONARY = 3;

  // Run-length encoding.
  RLE = 4;
}
// Metadata locating the dictionary values of a dictionary-encoded field.
message Dictionary {
  // File offset at which the dictionary values are stored.
  // Only valid if the encoding is DICTIONARY.
  //
  // The logical type of the field describes the value type of the column,
  // e.g. a string value.
  int64 offset = 1;

  // Length of the dictionary values.
  //
  // NOTE(review): the unit (bytes vs. number of values) is not stated here --
  // confirm against the reader/writer.
  int64 length = 2;
}
// Metadata describing one field (column) of a file.
message Field {
  // Structural kind of a field. See `logical_type` for the logical types that
  // go with each kind.
  enum Type {
    // Field whose logical type is "struct".
    PARENT = 0;
    // List-like field ("list", "large_list", and their ".struct" variants).
    REPEATED = 1;
    // Field holding one of the leaf logical types listed on `logical_type`.
    LEAF = 2;
  }

  // Structural kind of this field.
  Type type = 1;

  // Fully qualified name.
  string name = 2;

  // Field id.
  //
  // See the comment on `DataFile.fields` for how field ids are assigned.
  int32 id = 3;

  // Id of the parent field. If not set, this is a top-level column.
  //
  // NOTE(review): a proto3 scalar without `optional` has no presence, so an
  // unset value reads back as 0 -- confirm which value writers actually use
  // to mark a top-level column.
  int32 parent_id = 4;

  // Logical type; supports parameterized Arrow types.
  //
  // PARENT types always have the logical type "struct".
  //
  // REPEATED types may have the logical types:
  //   * "list"
  //   * "large_list"
  //   * "list.struct"
  //   * "large_list.struct"
  // The last two are used when the list values are structs, in which case the
  // field is implicitly both REPEATED and PARENT.
  //
  // LEAF types may have the logical types:
  //   * "null"
  //   * "bool"
  //   * "int8" / "uint8"
  //   * "int16" / "uint16"
  //   * "int32" / "uint32"
  //   * "int64" / "uint64"
  //   * "halffloat" / "float" / "double"
  //   * "string" / "large_string"
  //   * "binary" / "large_binary"
  //   * "date32:day"
  //   * "date64:ms"
  //   * "decimal:128:{precision}:{scale}" / "decimal:256:{precision}:{scale}"
  //   * "time:{unit}" / "timestamp:{unit}" / "duration:{unit}", where unit is
  //     "s", "ms", "us", or "ns"
  //   * "dict:{value_type}:{index_type}:false"
  string logical_type = 5;

  // Whether this field is nullable.
  bool nullable = 6;

  // Encoding of this field.
  Encoding encoding = 7;

  // Location of this field's dictionary values in the file (see `Dictionary`).
  // Only valid if `encoding` is DICTIONARY.
  //
  // The logical type describes the value type of the column, e.g. a string
  // value.
  Dictionary dictionary = 8;

  // Deprecated: optional extension type name. Use the
  // `ARROW:extension:name` key of `metadata` instead.
  //
  // NOTE(review): deprecated by comment only; the field carries no
  // `[deprecated = true]` option.
  string extension_name = 9;

  // Optional field metadata (e.g. extension type name / parameters).
  map<string, bytes> metadata = 10;

  // Storage class of the field.
  //
  // This determines the rate at which the field is compacted.
  //
  // Currently there are only two storage classes:
  //
  //   ""     - The default storage class.
  //   "blob" - The field is compacted into fewer rows per fragment.
  //
  // Fields with a non-default storage class are stored in different datasets
  // (e.g. blob fields are stored in the nested "_blobs" dataset).
  string storage_class = 11;
}