{-# LANGUAGE OverloadedStrings #-}
module Hydra.Sources.Tier4.Langs.Parquet.Format where
import Hydra.Sources.Tier3.All
import Hydra.Dsl.Annotations
import Hydra.Dsl.Bootstrap
import Hydra.Dsl.Types as Types
parquetFormatModule :: Module
parquetFormatModule :: Module
parquetFormatModule = Namespace
-> [Element] -> [Module] -> [Module] -> Maybe [Char] -> Module
Module Namespace
ns [Element]
elements [Module
hydraCoreModule] [Module]
tier0Modules (Maybe [Char] -> Module) -> Maybe [Char] -> Module
forall a b. (a -> b) -> a -> b
$
[Char] -> Maybe [Char]
forall a. a -> Maybe a
Just ([Char]
"A model for the Parquet format. Based on the Thrift-based specification at:\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift")
where
ns :: Namespace
ns = [Char] -> Namespace
Namespace [Char]
"hydra/langs/parquet/format"
def :: [Char] -> Type -> Element
def = Namespace -> [Char] -> Type -> Element
datatype Namespace
ns
parquet :: [Char] -> Type
parquet = Namespace -> [Char] -> Type
typeref Namespace
ns
elements :: [Element]
elements = [
[Char] -> Type -> Element
def [Char]
"Type" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"Types supported by Parquet. These types are intended to be used in combination " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"with the encodings to control the on disk storage format. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"For example INT16 is not included as a type since a good encoding of INT32 " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"would handle this.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[[Char]] -> Type
enum [
[Char]
"boolean",
[Char]
"int32",
[Char]
"int64",
[Char]
"float",
[Char]
"double",
[Char]
"byteArray",
[Char]
"fixedLenByteArray"],
[Char] -> Type -> Element
def [Char]
"FieldRepetitionType" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc [Char]
"Representation of Schemas" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
union [
[Char]
"required"[Char] -> Type -> FieldType
>: [Char] -> Type -> Type
doc [Char]
"This field is required (can not be null) and each record has exactly 1 value." Type
unit,
[Char]
"optional"[Char] -> Type -> FieldType
>: [Char] -> Type -> Type
doc [Char]
"The field is optional (can be null) and each record has 0 or 1 values." Type
unit,
[Char]
"repeated"[Char] -> Type -> FieldType
>: [Char] -> Type -> Type
doc [Char]
"The field is repeated and can contain 0 or more values" Type
unit],
[Char] -> Type -> Element
def [Char]
"Statistics" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc [Char]
"Statistics per row group and per page. All fields are optional." (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"nullCount"[Char] -> Type -> FieldType
>: Type -> Type
optional Type
uint64,
[Char]
"distinctCount"[Char] -> Type -> FieldType
>: Type -> Type
optional Type
uint64,
[Char]
"maxValue"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Max value for the column, determined by its ColumnOrder. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Values are encoded using PLAIN encoding, except that variable-length byte " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"arrays do not include a length prefix.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
binary,
[Char]
"minValue"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Max value for the column, determined by its ColumnOrder. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Values are encoded using PLAIN encoding, except that variable-length byte " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"arrays do not include a length prefix.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
binary],
[Char] -> Type -> Element
def [Char]
"DecimalType" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"Decimal logical type annotation. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"To maintain forward-compatibility in v1, implementations using this logical " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"type must also set scale and precision on the annotated SchemaElement. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Allowed for physical types: INT32, INT64, FIXED, and BINARY") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"scale"[Char] -> Type -> FieldType
>: Type
int32,
[Char]
"precision"[Char] -> Type -> FieldType
>: Type
int32],
[Char] -> Type -> Element
def [Char]
"TimeUnit" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[[Char]] -> Type
enum [
[Char]
"millis",
[Char]
"micros",
[Char]
"nanos"],
[Char] -> Type -> Element
def [Char]
"TimestampType" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"Timestamp logical type annotation. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Allowed for physical types: INT64") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"isAdjustedToUtc"[Char] -> Type -> FieldType
>: Type
boolean,
[Char]
"unit"[Char] -> Type -> FieldType
>: [Char] -> Type
parquet [Char]
"TimeUnit"],
[Char] -> Type -> Element
def [Char]
"TimeType" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"Time logical type annotation. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Allowed for physical types: INT32 (millis), INT64 (micros, nanos)") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"isAdjustedToUtc"[Char] -> Type -> FieldType
>: Type
boolean,
[Char]
"unit"[Char] -> Type -> FieldType
>: [Char] -> Type
parquet [Char]
"TimeUnit"],
[Char] -> Type -> Element
def [Char]
"IntType" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"Integer logical type annotation. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"bitWidth must be 8, 16, 32, or 64. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Allowed for physical types: INT32, INT64") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"bitWidth"[Char] -> Type -> FieldType
>: Type
uint8,
[Char]
"isSigned"[Char] -> Type -> FieldType
>: Type
boolean],
[Char] -> Type -> Element
def [Char]
"LogicalType" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"LogicalType annotations to replace ConvertedType. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"To maintain compatibility, implementations using LogicalType for a " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"SchemaElement aust also set the corresponding ConvertedType (if any) " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"from the following table.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
union [
[Char]
"string"[Char] -> Type -> FieldType
>: [Char] -> Type -> Type
doc [Char]
"use ConvertedType UTF8" Type
unit,
[Char]
"map"[Char] -> Type -> FieldType
>: [Char] -> Type -> Type
doc [Char]
"use ConvertedType MAP" Type
unit,
[Char]
"list"[Char] -> Type -> FieldType
>: [Char] -> Type -> Type
doc [Char]
"use ConvertedType LIST" Type
unit,
[Char]
"enum"[Char] -> Type -> FieldType
>: [Char] -> Type -> Type
doc [Char]
"use ConvertedType ENUM" Type
unit,
[Char]
"decimal"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"use ConvertedType DECIMAL + SchemaElement.{scale, precision}" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"DecimalType",
[Char]
"date"[Char] -> Type -> FieldType
>: [Char] -> Type -> Type
doc [Char]
"use ConvertedType DATE" Type
unit,
[Char]
"time"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"use ConvertedType TIME_MICROS for TIME(isAdjustedToUTC = *, unit = MICROS). " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"use ConvertedType TIME_MILLIS for TIME(isAdjustedToUTC = *, unit = MILLIS)") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"TimeType",
[Char]
"timestamp"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"use ConvertedType TIMESTAMP_MICROS for TIMESTAMP(isAdjustedToUTC = *, unit = MICROS). " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"use ConvertedType TIMESTAMP_MILLIS for TIMESTAMP(isAdjustedToUTC = *, unit = MILLIS)") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"TimestampType",
[Char]
"integer"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"use ConvertedType INT_* or UINT_*" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"IntType",
[Char]
"unknown"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"no compatible ConvertedType" Type
unit,
[Char]
"json"[Char] -> Type -> FieldType
>: [Char] -> Type -> Type
doc [Char]
"use ConvertedType JSON" Type
unit,
[Char]
"bson"[Char] -> Type -> FieldType
>: [Char] -> Type -> Type
doc [Char]
"use ConvertedType BSON" Type
unit,
[Char]
"uuid"[Char] -> Type -> FieldType
>: [Char] -> Type -> Type
doc [Char]
"no compatible ConvertedType" Type
unit],
[Char] -> Type -> Element
def [Char]
"SchemaElement" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"Represents a element inside a schema definition.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"- if it is a group (inner node) then type is undefined and num_children is defined\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"- if it is a primitive type (leaf) then type is defined and num_children is undefined\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"the nodes are listed in depth first traversal order.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"type"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Data type for this field. Not set if the current element is a non-leaf node" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"Type",
[Char]
"typeLength"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Otherwise, if specified, this is the maximum bit length to store any of the values. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"(e.g. a low cardinality INT col could have this set to 3). Note that this is " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"in the schema, and therefore fixed for the entire file.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int32,
[Char]
"repetitionType"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"repetition of the field. The root of the schema does not have a repetition_type. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"All other nodes must have one") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"FieldRepetitionType",
[Char]
"name"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Name of the field in the schema"
Type
string,
[Char]
"numChildren"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Nested fields. Since thrift does not support nested fields, " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"the nesting is flattened to a single list by a depth-first traversal. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"The children count is used to construct the nested relationship. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"This field is not set when the element is a primitive type") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int32,
[Char]
"fieldId"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"When the original schema supports field ids, this will save the " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"original field id in the parquet schema") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int32,
[Char]
"logicalType"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"The logical type of this SchemaElement. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"LogicalType replaces ConvertedType, but ConvertedType is still required " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"for some logical types to ensure forward-compatibility in format v1.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"LogicalType"],
[Char] -> Type -> Element
def [Char]
"Encoding" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"Encodings supported by Parquet. Not all encodings are valid for all types. These " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"enums are also used to specify the encoding of definition and repetition levels. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"See the accompanying doc for the details of the more complicated encodings.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
union [
[Char]
"plain"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Default encoding.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"BOOLEAN - 1 bit per value. 0 is false; 1 is true.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"INT32 - 4 bytes per value. Stored as little-endian.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"INT64 - 8 bytes per value. Stored as little-endian.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"FLOAT - 4 bytes per value. IEEE. Stored as little-endian.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"DOUBLE - 8 bytes per value. IEEE. Stored as little-endian.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"FIXED_LEN_BYTE_ARRAY - Just the bytes.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type
unit,
[Char]
"rle"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Group packed run length encoding. Usable for definition/repetition levels " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"encoding and Booleans (on one bit: 0 is false; 1 is true.)") Type
unit,
[Char]
"bitPacked"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Bit packed encoding. This can only be used if the data has a known max " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"width. Usable for definition/repetition levels encoding.") Type
unit,
[Char]
"deltaBinaryPacked"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Delta encoding for integers. This can be used for int columns and works best " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"on sorted data") Type
unit,
[Char]
"deltaLengthByteArray"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Encoding for byte arrays to separate the length values and the data. The lengths " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"are encoded using DELTA_BINARY_PACKED") Type
unit,
[Char]
"deltaByteArray"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Suffixes are stored as delta length byte arrays.") Type
unit,
[Char]
"rleDictionary"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Dictionary encoding: the ids are encoded using the RLE encoding") Type
unit,
[Char]
"byteStreamSplit"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Encoding for floating-point data. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"K byte-streams are created where K is the size in bytes of the data type. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"The individual bytes of an FP value are scattered to the corresponding stream and " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"the streams are concatenated. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"This itself does not reduce the size of the data but can lead to better compression " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"afterwards.") Type
unit],
[Char] -> Type -> Element
def [Char]
"CompressionCodec" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"Supported compression algorithms. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Codecs added in format version X.Y can be read by readers based on X.Y and later. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Codec support may vary between readers based on the format version and " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"libraries available at runtime. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"See Compression.md for a detailed specification of these algorithms.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
union [
[Char]
"uncompressed"[Char] -> Type -> FieldType
>: Type
unit,
[Char]
"snappy"[Char] -> Type -> FieldType
>: Type
unit,
[Char]
"gzip"[Char] -> Type -> FieldType
>: Type
unit,
[Char]
"lzo"[Char] -> Type -> FieldType
>: Type
unit,
[Char]
"brotli"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Added in 2.4" Type
unit,
[Char]
"zstd"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Added in 2.4" Type
unit,
[Char]
"lz4Raw"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Added in 2.9" Type
unit],
[Char] -> Type -> Element
def [Char]
"PageType" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[[Char]] -> Type
enum [
[Char]
"dataPage",
[Char]
"indexPage",
[Char]
"dictionaryPage",
[Char]
"dataPageV2"],
[Char] -> Type -> Element
def [Char]
"BoundaryOrder" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"Enum to annotate whether lists of min/max elements inside ColumnIndex " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"are ordered and if so, in which direction.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[[Char]] -> Type
enum [
[Char]
"unordered",
[Char]
"ascending",
[Char]
"descending"],
[Char] -> Type -> Element
def [Char]
"DataPageHeader" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc [Char]
"Data page header" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"numValues"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Number of values, including NULLs, in this data page."
Type
int32,
[Char]
"encoding"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Encoding used for this data page" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"Encoding",
[Char]
"definitionLevelEncoding"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Encoding used for definition levels" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"Encoding",
[Char]
"repetitionLevelEncoding"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Encoding used for repetition levels" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"Encoding",
[Char]
"statistics"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Optional statistics for the data in this page" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"Statistics"],
[Char] -> Type -> Element
def [Char]
"IndexPageHeader" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$ [FieldType] -> Type
record [],
[Char] -> Type -> Element
def [Char]
"DictionaryPageHeader" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"The dictionary page must be placed at the first position of the column chunk " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"if it is partly or completely dictionary encoded. At most one dictionary page " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"can be placed in a column chunk.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"numValues"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Number of values in the dictionary" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type
int32,
[Char]
"encoding"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Encoding using this dictionary page" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"Encoding",
[Char]
"isSorted"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"If true, the entries in the dictionary are sorted in ascending order" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
boolean],
[Char] -> Type -> Element
def [Char]
"DataPageHeaderV2" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"New page format allowing reading levels without decompressing the data " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Repetition and definition levels are uncompressed " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"The remaining section containing the data is compressed if is_compressed is true") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"numValues"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Number of values, including NULLs, in this data page." (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type
int32,
[Char]
"numNulls"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Number of NULL values, in this data page. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Number of non-null = num_values - num_nulls which is also the number of values in the data section") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type
int32,
[Char]
"numRows"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Number of rows in this data page. which means pages change on record boundaries (r = 0)" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type
int32,
[Char]
"encoding"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Encoding used for data in this page" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"Encoding",
[Char]
"definitionLevelsByteLength"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"length of the definition levels" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type
int32,
[Char]
"repetitionLevelsByteLength"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"length of the repetition levels" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type
int32,
[Char]
"isCompressed"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"whether the values are compressed. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Which means the section of the page between " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"is compressed with the compression_codec. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"If missing it is considered compressed") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
boolean,
[Char]
"statistics"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"optional statistics for the data in this page" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"Statistics"],
[Char] -> Type -> Element
def [Char]
"BloomFilterAlgorithm" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc [Char]
"The algorithm used in Bloom filter." (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
union [
[Char]
"block"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Block-based Bloom filter." Type
unit],
[Char] -> Type -> Element
def [Char]
"BloomFilterHash" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"The hash function used in Bloom filter. This function takes the hash of a column value " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"using plain encoding.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
union [
[Char]
"xxhash"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"xxHash Strategy." Type
unit],
[Char] -> Type -> Element
def [Char]
"BloomFilterCompression" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc [Char]
"The compression used in the Bloom filter." (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[[Char]] -> Type
enum [
[Char]
"uncompressed"],
[Char] -> Type -> Element
def [Char]
"BloomFilterHeader" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"Bloom filter header is stored at beginning of Bloom filter data of each column " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"and followed by its bitset.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"numBytes"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"The size of bitset in bytes" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type
int32,
[Char]
"algorithm"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"The algorithm for setting bits." (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"BloomFilterAlgorithm",
[Char]
"hash"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"The hash function used for Bloom filter." (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"BloomFilterHash",
[Char]
"compression"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"The compression used in the Bloom filter" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"BloomFilterCompression"],
[Char] -> Type -> Element
def [Char]
"PageHeader" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"type"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"the type of the page: indicates which of the *_header fields is set" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"PageType",
[Char]
"uncompressedPageSize"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Uncompressed page size in bytes (not including this header)" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type
int32,
[Char]
"compressedPageSize"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Compressed (and potentially encrypted) page size in bytes, not including this header" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type
int32,
[Char]
"crc"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"The 32bit CRC for the page, to be be calculated as follows:\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"- Using the standard CRC32 algorithm\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"- On the data only, i.e. this header should not be included. 'Data'\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" hereby refers to the concatenation of the repetition levels, the\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" definition levels and the column value, in this exact order.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"- On the encoded versions of the repetition levels, definition levels and\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" column values\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"- On the compressed versions of the repetition levels, definition levels\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" and column values where possible;\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" - For v1 data pages, the repetition levels, definition levels and column\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" values are always compressed together. If a compression scheme is\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" specified, the CRC shall be calculated on the compressed version of\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" this concatenation. If no compression scheme is specified, the CRC\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" shall be calculated on the uncompressed version of this concatenation.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" - For v2 data pages, the repetition levels and definition levels are\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" handled separately from the data and are never compressed (only\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" encoded). If a compression scheme is specified, the CRC shall be\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" calculated on the concatenation of the uncompressed repetition levels,\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" uncompressed definition levels and the compressed column values.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" If no compression scheme is specified, the CRC shall be calculated on\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" the uncompressed concatenation.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"- In encrypted columns, CRC is calculated after page encryption; the\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" encryption itself is performed after page compression (if compressed)\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"If enabled, this allows for disabling checksumming in HDFS if only a few " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"pages need to be read. ") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int32,
[Char]
"dataPageHeader"[Char] -> Type -> FieldType
>:
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"DataPageHeader",
[Char]
"indexPageHeader"[Char] -> Type -> FieldType
>:
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"IndexPageHeader",
[Char]
"dictionaryPageHeader"[Char] -> Type -> FieldType
>:
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"DictionaryPageHeader",
[Char]
"dataPageHeaderV2"[Char] -> Type -> FieldType
>:
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"DataPageHeaderV2"],
[Char] -> Type -> Element
def [Char]
"KeyValue" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc [Char]
"Wrapper struct to store key values" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"key"[Char] -> Type -> FieldType
>: Type
string,
[Char]
"value"[Char] -> Type -> FieldType
>: Type -> Type
optional Type
string],
[Char] -> Type -> Element
def [Char]
"SortingColumn" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc [Char]
"Wrapper struct to specify sort order" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"columnIdx"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"The column index (in this row group)"
Type
int32,
[Char]
"descending"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"If true, indicates this column is sorted in descending order."
Type
boolean,
[Char]
"nullsFirst"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"If true, nulls will come before non-null values, otherwise, " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"nulls go at the end.")
Type
boolean],
[Char] -> Type -> Element
def [Char]
"PageEncodingStats" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc [Char]
"statistics of a given page type and encoding" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"pageType"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"the page type (data/dic/...)" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"PageType",
[Char]
"encoding"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"encoding of the page" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"Encoding",
[Char]
"count"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"number of pages of this type with this encoding"
Type
int32],
[Char] -> Type -> Element
def [Char]
"ColumnMetaData" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc [Char]
"Description for column metadata" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"type"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Type of this column" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"Type",
[Char]
"encodings"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Set of all encodings used for this column. The purpose is to validate " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"whether we can decode those pages.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
list (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"Encoding",
[Char]
"pathInSchema"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Path in schema" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
list Type
string,
[Char]
"codec"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Compression codec" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"CompressionCodec",
[Char]
"numValues"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Number of values in this column"
Type
int64,
[Char]
"totalUncompressedSize"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"total byte size of all uncompressed pages in this column chunk (including the headers)"
Type
int64,
[Char]
"totalCompressedSize"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"total byte size of all compressed, and potentially encrypted, pages " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"in this column chunk (including the headers)")
Type
int64,
[Char]
"keyValueMetadata"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Optional key/value metadata" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ Type -> Type
list (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"KeyValue",
[Char]
"dataPageOffset"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Byte offset from beginning of file to first data page"
Type
int64,
[Char]
"indexPageOffset"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Byte offset from beginning of file to root index page" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int64,
[Char]
"dictionaryPageOffset"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Byte offset from the beginning of file to first (only) dictionary page" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int64,
[Char]
"statistics"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"optional statistics for this column chunk" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"Statistics",
[Char]
"encodingStats"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Set of all encodings used for pages in this column chunk. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"This information can be used to determine if all data pages are " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"dictionary encoded for example") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ Type -> Type
list (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"PageEncodingStats",
[Char]
"bloomFilterOffset"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Byte offset from beginning of file to Bloom filter data." (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int64],
[Char] -> Type -> Element
def [Char]
"EncryptionWithFooterKey" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$ [FieldType] -> Type
record [],
[Char] -> Type -> Element
def [Char]
"EncryptionWithColumnKey" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"pathInSchema"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Column path in schema" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
list Type
string,
[Char]
"keyMetadata"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Retrieval metadata of column encryption key" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
binary],
[Char] -> Type -> Element
def [Char]
"ColumnCryptoMetaData" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
union [
[Char]
"encryptionWithFooterKey"[Char] -> Type -> FieldType
>: [Char] -> Type
parquet [Char]
"EncryptionWithFooterKey",
[Char]
"encryptionWithColumnKey"[Char] -> Type -> FieldType
>: [Char] -> Type
parquet [Char]
"EncryptionWithColumnKey"],
[Char] -> Type -> Element
def [Char]
"ColumnChunk" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"filePath"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"File where column data is stored. If not set, assumed to be same file as " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"metadata. This path is relative to the current file.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
string,
[Char]
"fileOffset"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Byte offset in file_path to the ColumnMetaData"
Type
int64,
[Char]
"metaData"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Column metadata for this chunk. This is the same content as what is at " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"file_path/file_offset. Having it here has it replicated in the file " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"metadata.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"ColumnMetaData",
[Char]
"offsetIndexOffset"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"File offset of ColumnChunk's OffsetIndex" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int64,
[Char]
"offsetIndexLength"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Size of ColumnChunk's OffsetIndex, in bytes" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int32,
[Char]
"columnIndexOffset"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"File offset of ColumnChunk's ColumnIndex" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int64,
[Char]
"columnIndexLength"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Size of ColumnChunk's ColumnIndex, in bytes" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int32,
[Char]
"cryptoMetadata"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Crypto metadata of encrypted columns" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"ColumnCryptoMetaData",
[Char]
"encryptedColumnMetadata"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Encrypted column metadata for this chunk" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
binary],
[Char] -> Type -> Element
def [Char]
"RowGroup" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"columns"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Metadata for each column chunk in this row group. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"This list must have the same order as the SchemaElement list in FileMetaData.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
list (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"ColumnChunk",
[Char]
"totalByteSize"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Total byte size of all the uncompressed column data in this row group"
Type
int64,
[Char]
"numRows"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Number of rows in this row group"
Type
int64,
[Char]
"sortingColumns"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"If set, specifies a sort ordering of the rows in this RowGroup. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"The sorting columns can be a subset of all the columns.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ Type -> Type
list (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"SortingColumn",
[Char]
"fileOffset"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Byte offset from beginning of file to first page (data or dictionary) " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"in this row group") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int64,
[Char]
"totalCompressedSize"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Total byte size of all compressed (and potentially encrypted) column data " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"in this row group") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int64,
[Char]
"ordinal"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Row group ordinal in the file" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
int16],
[Char] -> Type -> Element
def [Char]
"ColumnOrder" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"Union to specify the order used for the min_value and max_value fields for a " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"column. This union takes the role of an enhanced enum that allows rich " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"elements (which will be needed for a collation-based ordering in the future). " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Possible values are:\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"* TypeDefinedOrder - the column uses the order defined by its logical or " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"physical type (if there is no logical type).\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"If the reader does not support the value of this union, min and max stats " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"for this column should be ignored. ") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
union [
[Char]
"typeOrder"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"The sort orders for logical types are:\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" UTF8 - unsigned byte-wise comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" INT8 - signed comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" INT16 - signed comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" INT32 - signed comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" INT64 - signed comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" UINT8 - unsigned comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" UINT16 - unsigned comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" UINT32 - unsigned comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" UINT64 - unsigned comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" DECIMAL - signed comparison of the represented value\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" DATE - signed comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" TIME_MILLIS - signed comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" TIME_MICROS - signed comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" TIMESTAMP_MILLIS - signed comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" TIMESTAMP_MICROS - signed comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" INTERVAL - unsigned comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" JSON - unsigned byte-wise comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" BSON - unsigned byte-wise comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" ENUM - unsigned byte-wise comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" LIST - undefined\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" MAP - undefined\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"In the absence of logical types, the sort order is determined by the physical type:\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" BOOLEAN - false, true\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" INT32 - signed comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" INT64 - signed comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" INT96 (only used for legacy timestamps) - undefined\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" FLOAT - signed comparison of the represented value (*)\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" DOUBLE - signed comparison of the represented value (*)\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" BYTE_ARRAY - unsigned byte-wise comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"(*) Because the sorting order is not specified properly for floating\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" point values (relations vs. total ordering) the following\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" compatibility rules should be applied when reading statistics:\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" - If the min is a NaN, it should be ignored.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" - If the max is a NaN, it should be ignored.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" - If the min is +0, the row group may contain -0 values as well.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" - If the max is -0, the row group may contain +0 values as well.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
" - When looking for NaN values, min and max should be ignored.") Type
unit],
[Char] -> Type -> Element
def [Char]
"PageLocation" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"offset"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Offset of the page in the file"
Type
int64,
[Char]
"compressedPageSize"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Size of the page, including header. Sum of compressed_page_size and header " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"length")
Type
int32,
[Char]
"firstRowIndex"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Index within the RowGroup of the first row of the page; this means pages " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"change on record boundaries (r = 0).")
Type
int64],
[Char] -> Type -> Element
def [Char]
"OffsetIndex" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"pageLocations"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"PageLocations, ordered by increasing PageLocation.offset. It is required " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"that page_locations[i].first_row_index < page_locations[i+1].first_row_index.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
list (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"PageLocation"],
[Char] -> Type -> Element
def [Char]
"ColumnIndex" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc ([Char]
"Description for ColumnIndex. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Each <array-field>[i] refers to the page at OffsetIndex.page_locations[i]") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"nullPages"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"A list of Boolean values to determine the validity of the corresponding " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"min and max values. If true, a page contains only null values, and writers " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"have to set the corresponding entries in min_values and max_values to " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"byte[0], so that all lists have the same length. If false, the " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"corresponding entries in min_values and max_values must be valid.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
list Type
boolean,
[Char]
"minValues"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"minValues and maxValues are lists containing lower and upper bounds for the values of each page " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"determined by the ColumnOrder of the column. These may be the actual " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"minimum and maximum values found on a page, but can also be (more compact) " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"values that do not exist on a page. For example, instead of storing \"Blart " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Versenwald III\", a writer may set min_values[i]=\"B\", max_values[i]=\"C\". " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Such more compact values must still be valid values within the column's " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"logical type. Readers must make sure that list entries are populated before " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"using them by inspecting null_pages.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
list Type
binary,
[Char]
"maxValues"[Char] -> Type -> FieldType
>: Type -> Type
list Type
binary,
[Char]
"boundaryOrder"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Stores whether both min_values and max_values are orderd and if so, in " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"which direction. This allows readers to perform binary searches in both " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"lists. Readers cannot assume that max_values[i] <= min_values[i+1], even " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"if the lists are ordered.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"BoundaryOrder",
[Char]
"nullCounts"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"A list containing the number of null values for each page" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ Type -> Type
list Type
int64],
[Char] -> Type -> Element
def [Char]
"AesGcmV1" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"aadPrefix"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"AAD prefix" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
binary,
[Char]
"aadFileUnique"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Unique file identifier part of AAD suffix" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
binary,
[Char]
"supplyAadPrefix"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"In files encrypted with AAD prefix without storing it, " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"readers must supply the prefix") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
boolean],
[Char] -> Type -> Element
def [Char]
"AesGcmCtrV1" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"aadPrefix"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"AAD prefix" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
binary,
[Char]
"aadFileUnique"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Unique file identifier part of AAD suffix" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
binary,
[Char]
"supplyAadPrefix"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"In files encrypted with AAD prefix without storing it, " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"readers must supply the prefix") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
boolean],
[Char] -> Type -> Element
def [Char]
"EncryptionAlgorithm" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
union [
[Char]
"aesGcmV1"[Char] -> Type -> FieldType
>: [Char] -> Type
parquet [Char]
"AesGcmV1",
[Char]
"aesGcmCtrV1"[Char] -> Type -> FieldType
>: [Char] -> Type
parquet [Char]
"AesGcmCtrV1"],
[Char] -> Type -> Element
def [Char]
"FileMetaData" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc [Char]
"Description for file metadata" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"version"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Version of this file"
Type
int32,
[Char]
"schema"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Parquet schema for this file. This schema contains metadata for all the columns. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"The schema is represented as a tree with a single root. The nodes of the tree " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"are flattened to a list by doing a depth-first traversal. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"The column metadata contains the path in the schema for that column which can be " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"used to map columns to nodes in the schema. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"The first element is the root") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
list (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"SchemaElement",
[Char]
"numRows"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Number of rows in this file"
Type
int64,
[Char]
"rowGroups"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Row groups in this file" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
list (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"RowGroup",
[Char]
"keyValueMetadata"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc [Char]
"Optional key/value metadata" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ Type -> Type
list (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"KeyValue",
[Char]
"createdBy"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"String for application that wrote this file. This should be in the format " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"<Application> version <App Version> (build <App Build Hash>). " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
string,
[Char]
"columnOrders"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Sort order used for the min_value and max_value fields in the Statistics " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"objects and the min_values and max_values fields in the ColumnIndex " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"objects of each column in this file. Sort orders are listed in the order " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"matching the columns in the schema. The indexes are not necessary the same " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"though, because only leaf nodes of the schema are represented in the list " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"of sort orders.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Without column_orders, the meaning of the min_value and max_value fields " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"in the Statistics object and the ColumnIndex object is undefined. To ensure " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"well-defined behaviour, if these fields are written to a Parquet file, " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"column_orders must be written as well.\n" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"The obsolete min and max fields in the Statistics object are always sorted " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"by signed comparison regardless of column_orders.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ Type -> Type
list (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"ColumnOrder",
[Char]
"encryptionAlgorithm"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Encryption algorithm. This field is set only in encrypted files " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"with plaintext footer. Files with encrypted footer store algorithm id " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"in FileCryptoMetaData structure.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$ [Char] -> Type
parquet [Char]
"EncryptionAlgorithm",
[Char]
"footerSigningKeyMetadata"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Retrieval metadata of key used for signing the footer. " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"Used only in encrypted files with plaintext footer.") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
binary],
[Char] -> Type -> Element
def [Char]
"FileCryptoMetaData" (Type -> Element) -> Type -> Element
forall a b. (a -> b) -> a -> b
$
[Char] -> Type -> Type
doc [Char]
"Crypto metadata for files with encrypted footer" (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[FieldType] -> Type
record [
[Char]
"encryptionAlgorithm"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Encryption algorithm. This field is only used for files " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"with encrypted footer. Files with plaintext footer store algorithm id " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"inside footer (FileMetaData structure).") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
[Char] -> Type
parquet [Char]
"EncryptionAlgorithm",
[Char]
"keyMetadata"[Char] -> Type -> FieldType
>:
[Char] -> Type -> Type
doc ([Char]
"Retrieval metadata of key used for encryption of footer, " [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++
[Char]
"and (possibly) columns") (Type -> Type) -> Type -> Type
forall a b. (a -> b) -> a -> b
$
Type -> Type
optional Type
binary]]