forked from apache/orc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathorc_proto.proto
272 lines (241 loc) · 7.12 KB
/
orc_proto.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// State the syntax explicitly. Without this line protoc falls back to proto2
// and prints a "no syntax specified" warning; declaring it keeps the default
// from being implicit and leaves the wire format unchanged.
syntax = "proto2";

// Protobuf package that scopes every message and enum declared in this file.
package orc.proto;

// Java package used for the generated classes.
option java_package = "org.apache.orc";
// Minimum, maximum and sum statistics with signed 64-bit integer values.
message IntegerStatistics {
  // Smallest value.
  optional sint64 minimum = 1;
  // Largest value.
  optional sint64 maximum = 2;
  // Sum of the values.
  optional sint64 sum = 3;
}
// Minimum, maximum and sum statistics with double-precision values.
message DoubleStatistics {
  // Smallest value.
  optional double minimum = 1;
  // Largest value.
  optional double maximum = 2;
  // Sum of the values.
  optional double sum = 3;
}
// Statistics for string values: minimum and maximum strings plus a length
// total.
message StringStatistics {
  // Minimum string.
  optional string minimum = 1;
  // Maximum string.
  optional string maximum = 2;
  // Total length of all strings in a stripe (not a sum of the values
  // themselves).
  optional sint64 sum = 3;
}
// Statistics expressed as a list of counts.
// NOTE(review): what each bucket represents is not defined in this file;
// presumably one count per bucket - confirm against the reader/writer code.
message BucketStatistics {
  // Counts, written with packed encoding.
  repeated uint64 count = 1 [packed = true];
}
// Minimum, maximum and sum statistics for decimal values. All three are
// carried as strings rather than as numeric wire types.
message DecimalStatistics {
  // Smallest value, as a string.
  optional string minimum = 1;
  // Largest value, as a string.
  optional string maximum = 2;
  // Sum of the values, as a string.
  optional string sum = 3;
}
// Minimum and maximum statistics for date values.
// Both bounds are saved as days since epoch.
message DateStatistics {
  // Earliest date, in days since epoch.
  optional sint32 minimum = 1;
  // Latest date, in days since epoch.
  optional sint32 maximum = 2;
}
// Minimum and maximum statistics for timestamp values.
// The min/max values are saved as milliseconds since epoch.
message TimestampStatistics {
  // Earliest timestamp, in milliseconds since epoch.
  optional sint64 minimum = 1;
  // Latest timestamp, in milliseconds since epoch.
  optional sint64 maximum = 2;
  // NOTE(review): presumably the same bounds expressed in UTC (compare the
  // ORC-135 entry in PostScript.writerVersion, "timestamp statistics use
  // utc") - confirm how readers choose between the two pairs.
  optional sint64 minimumUtc = 3;
  optional sint64 maximumUtc = 4;
}
// Statistics for binary values.
message BinaryStatistics {
  // Total binary blob length in a stripe.
  optional sint64 sum = 1;
}
// Statistics for one column: a value count, a null flag, and a set of
// optional type-specific statistics sub-messages.
// NOTE(review): presumably only the sub-message matching the column's type
// is populated - this file does not enforce that (no oneof); confirm.
message ColumnStatistics {
  // Number of values.
  optional uint64 numberOfValues = 1;

  // Type-specific statistics.
  optional IntegerStatistics intStatistics = 2;
  optional DoubleStatistics doubleStatistics = 3;
  optional StringStatistics stringStatistics = 4;
  optional BucketStatistics bucketStatistics = 5;
  optional DecimalStatistics decimalStatistics = 6;
  optional DateStatistics dateStatistics = 7;
  optional BinaryStatistics binaryStatistics = 8;
  optional TimestampStatistics timestampStatistics = 9;

  // Whether the column has a null.
  optional bool hasNull = 10;
}
// A single row index entry: a list of positions together with optional
// column statistics.
message RowIndexEntry {
  // Positions, written with packed encoding.
  // NOTE(review): the meaning of each position is not defined in this file.
  repeated uint64 positions = 1 [packed = true];
  // Column statistics associated with this entry.
  optional ColumnStatistics statistics = 2;
}
// A row index: an ordered list of row index entries.
message RowIndex {
  repeated RowIndexEntry entry = 1;
}
// A bloom filter: the number of hash functions plus the filter's bit set,
// which can be carried in either of two representations.
message BloomFilter {
  // Number of hash functions used by the filter.
  optional uint32 numHashFunctions = 1;
  // Bit set as 64-bit words. This field carries no packed option; it is left
  // that way so the encoding produced by writers does not change.
  repeated fixed64 bitset = 2;
  // Bit set as raw bytes.
  // NOTE(review): the name suggests this pairs with Stream.BLOOM_FILTER_UTF8
  // - confirm.
  optional bytes utf8bitset = 3;
}
// A bloom filter index: an ordered list of bloom filters.
message BloomFilterIndex {
  repeated BloomFilter bloomFilter = 1;
}
// Describes one stream: its kind, a column number and a length.
message Stream {
  // The kind of a stream.
  //
  // If you add new index stream kinds, you need to make sure to update
  // StreamName to ensure it is added to the stripe in the right area.
  enum Kind {
    PRESENT = 0;
    DATA = 1;
    LENGTH = 2;
    DICTIONARY_DATA = 3;
    DICTIONARY_COUNT = 4;
    SECONDARY = 5;
    ROW_INDEX = 6;
    BLOOM_FILTER = 7;
    BLOOM_FILTER_UTF8 = 8;
  }

  // Kind of this stream.
  optional Kind kind = 1;
  // NOTE(review): presumably the id of the column the stream belongs to -
  // confirm.
  optional uint32 column = 2;
  // NOTE(review): presumably the stream length in bytes - the unit is not
  // stated in this file.
  optional uint64 length = 3;
}
// Describes how a column is encoded.
message ColumnEncoding {
  // The available encoding kinds.
  enum Kind {
    DIRECT = 0;
    DICTIONARY = 1;
    DIRECT_V2 = 2;
    DICTIONARY_V2 = 3;
  }

  // Encoding kind used for the column.
  optional Kind kind = 1;
  // Size of the dictionary.
  optional uint32 dictionarySize = 2;

  // The encoding of the bloom filters for this column:
  //   0 or missing = none or original
  //   1            = ORC-135 (utc for timestamps)
  optional uint32 bloomEncoding = 3;
}
// Footer of a stripe: its streams, the column encodings and the writer's
// time zone.
message StripeFooter {
  // Streams of the stripe.
  repeated Stream streams = 1;
  // Column encodings.
  // NOTE(review): presumably one entry per column, indexed by column id -
  // confirm.
  repeated ColumnEncoding columns = 2;
  // Time zone of the writer.
  optional string writerTimezone = 3;
}
// Describes a type: its kind, any subtypes and field names, and optional
// length / precision / scale attributes.
message Type {
  // The available type kinds.
  enum Kind {
    BOOLEAN = 0;
    BYTE = 1;
    SHORT = 2;
    INT = 3;
    LONG = 4;
    FLOAT = 5;
    DOUBLE = 6;
    STRING = 7;
    BINARY = 8;
    TIMESTAMP = 9;
    LIST = 10;
    MAP = 11;
    STRUCT = 12;
    UNION = 13;
    DECIMAL = 14;
    DATE = 15;
    VARCHAR = 16;
    CHAR = 17;
  }

  // Kind of this type.
  optional Kind kind = 1;
  // Subtypes, written with packed encoding.
  // NOTE(review): presumably indexes into Footer.types - confirm.
  repeated uint32 subtypes = 2 [packed = true];
  // Field names.
  // NOTE(review): presumably parallel to `subtypes` for STRUCT - confirm.
  repeated string fieldNames = 3;
  // Maximum length.
  // NOTE(review): presumably applies to VARCHAR / CHAR - confirm.
  optional uint32 maximumLength = 4;
  // Precision and scale.
  // NOTE(review): presumably apply to DECIMAL - confirm.
  optional uint32 precision = 5;
  optional uint32 scale = 6;
}
// Location and size information for one stripe.
// NOTE(review): offsets and lengths are presumably in bytes - the unit is
// not stated in this file.
message StripeInformation {
  // Offset of the stripe.
  optional uint64 offset = 1;
  // Length of the stripe's index section.
  optional uint64 indexLength = 2;
  // Length of the stripe's data section.
  optional uint64 dataLength = 3;
  // Length of the stripe's footer.
  optional uint64 footerLength = 4;
  // Number of rows in the stripe.
  optional uint64 numberOfRows = 5;
}
// One user metadata item: a name paired with a binary value.
message UserMetadataItem {
  // Name of the item.
  optional string name = 1;
  // Value of the item, as raw bytes.
  optional bytes value = 2;
}
// Statistics for one stripe: a list of column statistics.
message StripeStatistics {
  repeated ColumnStatistics colStats = 1;
}
// Metadata section: a list of stripe statistics.
message Metadata {
  repeated StripeStatistics stripeStats = 1;
}
// File footer: header and content lengths, the stripe list, the type list,
// user metadata, the row count, column statistics, the row index stride and
// the writer code.
message Footer {
  // Length of the header.
  optional uint64 headerLength = 1;
  // Length of the content.
  optional uint64 contentLength = 2;
  // Information about each stripe.
  repeated StripeInformation stripes = 3;
  // Types.
  repeated Type types = 4;
  // User metadata items.
  repeated UserMetadataItem metadata = 5;
  // Number of rows.
  optional uint64 numberOfRows = 6;
  // Column statistics.
  repeated ColumnStatistics statistics = 7;
  // Row index stride.
  optional uint32 rowIndexStride = 8;

  // Each implementation that writes ORC files should register for a code:
  //   0 = ORC Java
  //   1 = ORC C++
  //   2 = Presto
  //   3 = Scritchley Go from https://github.com/scritchley/orc
  optional uint32 writer = 9;
}
// The available compression kinds; referenced by PostScript.compression.
enum CompressionKind {
  NONE = 0;
  ZLIB = 1;
  SNAPPY = 2;
  LZO = 3;
  LZ4 = 4;
  ZSTD = 5;
}
// The postscript record.
//
// Serialized length must be less than 255 bytes.
message PostScript {
  // Length of the footer.
  optional uint64 footerLength = 1;
  // Compression kind.
  optional CompressionKind compression = 2;
  // Compression block size.
  optional uint64 compressionBlockSize = 3;

  // The version of the file format:
  //   [0, 11] = Hive 0.11
  //   [0, 12] = Hive 0.12
  repeated uint32 version = 4 [packed = true];

  // Length of the metadata.
  optional uint64 metadataLength = 5;

  // The version of the writer that wrote the file. This number is
  // updated when we make fixes or large changes to the writer so that
  // readers can detect whether a given bug is present in the data.
  //
  // Only the Java ORC writer may use values under 6 (or missing) so that
  // readers that predate ORC-202 treat the new writers correctly. Each
  // writer should assign their own sequence of versions starting from 6.
  //
  // Version of the ORC Java writer:
  //   0 = original
  //   1 = HIVE-8732 fixed (fixed stripe/file maximum statistics &
  //                        string statistics use utf8 for min/max)
  //   2 = HIVE-4243 fixed (use real column names from Hive tables)
  //   3 = HIVE-12055 fixed (vectorized writer implementation)
  //   4 = HIVE-13083 fixed (decimals write present stream correctly)
  //   5 = ORC-101 fixed (bloom filters use utf8 consistently)
  //   6 = ORC-135 fixed (timestamp statistics use utc)
  //
  // Version of the ORC C++ writer:
  //   6 = original
  //
  // Version of the Presto writer:
  //   6 = original
  //
  // Version of the Scritchley Go writer:
  //   6 = original
  //
  optional uint32 writerVersion = 6;

  // Leave this last in the record.
  optional string magic = 8000;
}
// The contents of the file tail that must be serialized.
// This gets serialized as part of OrcSplit, and is also used by the footer
// cache.
message FileTail {
  // The postscript record.
  optional PostScript postscript = 1;
  // The file footer.
  optional Footer footer = 2;
  // Length of the file.
  optional uint64 fileLength = 3;
  // Length of the postscript.
  optional uint64 postscriptLength = 4;
}