Documentation ¶
Index ¶
- func DefLevelsToBitmap(defLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput)
- func DefRepLevelsToBitmap(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput) error
- func DefRepLevelsToListInfo(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput, ...) error
- type BooleanColumnChunkReader
- func (c *BooleanColumnChunkReader) Descriptor() *schema.Column
- func (c *BooleanColumnChunkReader) Err() error
- func (c *BooleanColumnChunkReader) HasNext() bool
- func (cr *BooleanColumnChunkReader) ReadBatch(batchSize int64, values []bool, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *BooleanColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *BooleanColumnChunkReader) Type() parquet.Type
- type ByteArrayColumnChunkReader
- func (c *ByteArrayColumnChunkReader) Descriptor() *schema.Column
- func (c *ByteArrayColumnChunkReader) Err() error
- func (c *ByteArrayColumnChunkReader) HasNext() bool
- func (cr *ByteArrayColumnChunkReader) ReadBatch(batchSize int64, values []parquet.ByteArray, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *ByteArrayColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *ByteArrayColumnChunkReader) Type() parquet.Type
- type ColumnChunkReader
- type CryptoContext
- type DataPage
- type DataPageV1
- func (p *DataPageV1) Data() []byte
- func (d *DataPageV1) DefinitionLevelEncoding() parquet.Encoding
- func (p *DataPageV1) Encoding() format.Encoding
- func (p *DataPageV1) NumValues() int32
- func (d *DataPageV1) Release()
- func (d *DataPageV1) RepetitionLevelEncoding() parquet.Encoding
- func (d *DataPageV1) Statistics() metadata.EncodedStatistics
- func (p *DataPageV1) Type() format.PageType
- func (d *DataPageV1) UncompressedSize() int64
- type DataPageV2
- func (p *DataPageV2) Data() []byte
- func (d *DataPageV2) DefinitionLevelByteLen() int32
- func (p *DataPageV2) Encoding() format.Encoding
- func (d *DataPageV2) IsCompressed() bool
- func (d *DataPageV2) NumNulls() int32
- func (p *DataPageV2) NumValues() int32
- func (d *DataPageV2) Release()
- func (d *DataPageV2) RepetitionLevelByteLen() int32
- func (d *DataPageV2) Statistics() metadata.EncodedStatistics
- func (p *DataPageV2) Type() format.PageType
- func (d *DataPageV2) UncompressedSize() int64
- type DictionaryPage
- type FixedLenByteArrayColumnChunkReader
- func (c *FixedLenByteArrayColumnChunkReader) Descriptor() *schema.Column
- func (c *FixedLenByteArrayColumnChunkReader) Err() error
- func (c *FixedLenByteArrayColumnChunkReader) HasNext() bool
- func (cr *FixedLenByteArrayColumnChunkReader) ReadBatch(batchSize int64, values []parquet.FixedLenByteArray, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *FixedLenByteArrayColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *FixedLenByteArrayColumnChunkReader) Type() parquet.Type
- type Float32ColumnChunkReader
- func (c *Float32ColumnChunkReader) Descriptor() *schema.Column
- func (c *Float32ColumnChunkReader) Err() error
- func (c *Float32ColumnChunkReader) HasNext() bool
- func (cr *Float32ColumnChunkReader) ReadBatch(batchSize int64, values []float32, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *Float32ColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *Float32ColumnChunkReader) Type() parquet.Type
- type Float64ColumnChunkReader
- func (c *Float64ColumnChunkReader) Descriptor() *schema.Column
- func (c *Float64ColumnChunkReader) Err() error
- func (c *Float64ColumnChunkReader) HasNext() bool
- func (cr *Float64ColumnChunkReader) ReadBatch(batchSize int64, values []float64, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *Float64ColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *Float64ColumnChunkReader) Type() parquet.Type
- type Int32ColumnChunkReader
- func (c *Int32ColumnChunkReader) Descriptor() *schema.Column
- func (c *Int32ColumnChunkReader) Err() error
- func (c *Int32ColumnChunkReader) HasNext() bool
- func (cr *Int32ColumnChunkReader) ReadBatch(batchSize int64, values []int32, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *Int32ColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *Int32ColumnChunkReader) Type() parquet.Type
- type Int64ColumnChunkReader
- func (c *Int64ColumnChunkReader) Descriptor() *schema.Column
- func (c *Int64ColumnChunkReader) Err() error
- func (c *Int64ColumnChunkReader) HasNext() bool
- func (cr *Int64ColumnChunkReader) ReadBatch(batchSize int64, values []int64, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *Int64ColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *Int64ColumnChunkReader) Type() parquet.Type
- type Int96ColumnChunkReader
- func (c *Int96ColumnChunkReader) Descriptor() *schema.Column
- func (c *Int96ColumnChunkReader) Err() error
- func (c *Int96ColumnChunkReader) HasNext() bool
- func (cr *Int96ColumnChunkReader) ReadBatch(batchSize int64, values []parquet.Int96, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
- func (cr *Int96ColumnChunkReader) Skip(nvalues int64) (int64, error)
- func (c *Int96ColumnChunkReader) Type() parquet.Type
- type LevelInfo
- type Page
- type PageReader
- type ReadOption
- type Reader
- type RowGroupReader
- func (r *RowGroupReader) ByteSize() int64
- func (r *RowGroupReader) Column(i int) ColumnChunkReader
- func (r *RowGroupReader) GetColumnPageReader(i int) (PageReader, error)
- func (r *RowGroupReader) MetaData() *metadata.RowGroupMetaData
- func (r *RowGroupReader) NumColumns() int
- func (r *RowGroupReader) NumRows() int64
- type ValidityBitmapInputOutput
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func DefLevelsToBitmap ¶
func DefLevelsToBitmap(defLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput)
DefLevelsToBitmap creates a validity bitmap out of the passed in definition levels and info object.
func DefRepLevelsToBitmap ¶
func DefRepLevelsToBitmap(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput) error
DefRepLevelsToBitmap constructs a full validity bitmap out of the definition and repetition levels properly handling nested lists and parents.
func DefRepLevelsToListInfo ¶
func DefRepLevelsToListInfo(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput, offsets []int32) error
DefRepLevelsToListInfo takes in the definition and repetition levels in order to populate the validity bitmap and properly handle nested lists and update the offsets for them.
Types ¶
type BooleanColumnChunkReader ¶
type BooleanColumnChunkReader struct {
// contains filtered or unexported fields
}
BooleanColumnChunkReader is the Typed Column chunk reader instance for reading Boolean column data.
func (*BooleanColumnChunkReader) Descriptor ¶
func (*BooleanColumnChunkReader) HasNext ¶
func (c *BooleanColumnChunkReader) HasNext() bool
HasNext returns whether there is more data to be read in this column and row group.
func (*BooleanColumnChunkReader) ReadBatch ¶
func (cr *BooleanColumnChunkReader) ReadBatch(batchSize int64, values []bool, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
ReadBatch reads batchSize values from the column.
Returns error if values is not at least big enough to hold the number of values that will be read.
defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.
total is the number of rows that were read, valuesRead is the actual number of physical values that were read excluding nulls
type ByteArrayColumnChunkReader ¶
type ByteArrayColumnChunkReader struct {
// contains filtered or unexported fields
}
ByteArrayColumnChunkReader is the Typed Column chunk reader instance for reading ByteArray column data.
func (*ByteArrayColumnChunkReader) Descriptor ¶
func (*ByteArrayColumnChunkReader) HasNext ¶
func (c *ByteArrayColumnChunkReader) HasNext() bool
HasNext returns whether there is more data to be read in this column and row group.
func (*ByteArrayColumnChunkReader) ReadBatch ¶
func (cr *ByteArrayColumnChunkReader) ReadBatch(batchSize int64, values []parquet.ByteArray, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
ReadBatch reads batchSize values from the column.
Returns error if values is not at least big enough to hold the number of values that will be read.
defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.
total is the number of rows that were read, valuesRead is the actual number of physical values that were read excluding nulls
type ColumnChunkReader ¶
type ColumnChunkReader interface { // HasNext returns whether there is more data to be read in this column // and row group. HasNext() bool // Type returns the underlying physical type of the column Type() parquet.Type // Descriptor returns the column schema container Descriptor() *schema.Column // if HasNext returns false because of an error, this will return the error // it encountered. Otherwise this will be nil if it's just the end of the // column Err() error // contains filtered or unexported methods }
ColumnChunkReader is the basic interface for all column readers. It will use a page reader to read all the pages in a column chunk from a row group.
To actually Read out the column data, you need to convert to the properly typed ColumnChunkReader type such as *BooleanColumnReader etc.
Some things to clarify when working with column readers:
"Values" refers to the physical data values in a data page.
This is separate from the number of "rows" in a column and the total number of "elements" in a column because null values aren't stored physically in the data page but are represented via definition levels, so the number of values in a column can be less than the number of rows.
The total number of "elements" in a column also differs because of potential repeated fields, where you can have multiple values in the page which together make up a single element (such as a list) or depending on the repetition level and definition level, could represent an entire null list or just a null element inside of a list.
func NewColumnReader ¶
func NewColumnReader(descr *schema.Column, pageReader PageReader, mem memory.Allocator) ColumnChunkReader
NewColumnReader returns a column reader for the provided column initialized with the given page reader that will provide the pages of data for this column. The type is determined from the column passed in.
type CryptoContext ¶
type CryptoContext struct { StartDecryptWithDictionaryPage bool RowGroupOrdinal int16 ColumnOrdinal int16 MetaDecryptor encryption.Decryptor DataDecryptor encryption.Decryptor }
CryptoContext is a context for keeping track of the current methods for decrypting. It keeps track of the row group and column numbers along with references to the decryptor objects.
type DataPage ¶
type DataPage interface { Page UncompressedSize() int64 Statistics() metadata.EncodedStatistics }
DataPage is the base interface for both DataPageV1 and DataPageV2 of the parquet spec.
type DataPageV1 ¶
type DataPageV1 struct {
// contains filtered or unexported fields
}
DataPageV1 represents a DataPage version 1 from the parquet.thrift file
func NewDataPageV1 ¶
func NewDataPageV1(buffer *memory.Buffer, num int32, encoding, defEncoding, repEncoding parquet.Encoding, uncompressedSize int64) *DataPageV1
NewDataPageV1 returns a V1 data page with the given buffer as its data and the specified encoding information
Will utilize objects that have been released back into the data page pool and re-use them if available as opposed to creating new objects. Calling Release on the data page object will release it back to the pool for re-use.
func NewDataPageV1WithStats ¶
func NewDataPageV1WithStats(buffer *memory.Buffer, num int32, encoding, defEncoding, repEncoding parquet.Encoding, uncompressedSize int64, stats metadata.EncodedStatistics) *DataPageV1
NewDataPageV1WithStats is the same as NewDataPageV1, but also allows adding the stat info into the created page
func (*DataPageV1) DefinitionLevelEncoding ¶
func (d *DataPageV1) DefinitionLevelEncoding() parquet.Encoding
DefinitionLevelEncoding returns the encoding utilized for the Definition Levels
func (*DataPageV1) Release ¶
func (d *DataPageV1) Release()
Release this page back into the DataPage object pool so that it can be reused.
After calling this function, the object should not be utilized anymore, otherwise conflicts can arise.
func (*DataPageV1) RepetitionLevelEncoding ¶
func (d *DataPageV1) RepetitionLevelEncoding() parquet.Encoding
RepetitionLevelEncoding returns the encoding utilized for the Repetition Levels
func (*DataPageV1) Statistics ¶
func (d *DataPageV1) Statistics() metadata.EncodedStatistics
Statistics returns the encoded statistics on this data page
func (*DataPageV1) UncompressedSize ¶
func (d *DataPageV1) UncompressedSize() int64
UncompressedSize returns the size of the data in this data page when uncompressed
type DataPageV2 ¶
type DataPageV2 struct {
// contains filtered or unexported fields
}
DataPageV2 is the representation of the V2 data page from the parquet.thrift spec
func NewDataPageV2 ¶
func NewDataPageV2(buffer *memory.Buffer, numValues, numNulls, numRows int32, encoding parquet.Encoding, defLvlsByteLen, repLvlsByteLen int32, uncompressed int64, isCompressed bool) *DataPageV2
NewDataPageV2 constructs a new V2 data page with the provided information and a buffer of the raw data.
func NewDataPageV2WithStats ¶
func NewDataPageV2WithStats(buffer *memory.Buffer, numValues, numNulls, numRows int32, encoding parquet.Encoding, defLvlsByteLen, repLvlsByteLen int32, uncompressed int64, isCompressed bool, stats metadata.EncodedStatistics) *DataPageV2
NewDataPageV2WithStats is the same as NewDataPageV2 but allows providing the encoded stats with the page.
func (*DataPageV2) DefinitionLevelByteLen ¶
func (d *DataPageV2) DefinitionLevelByteLen() int32
DefinitionLevelByteLen is the number of bytes in the buffer that are used to represent the definition levels
func (*DataPageV2) IsCompressed ¶
func (d *DataPageV2) IsCompressed() bool
IsCompressed returns true if the data of this page is compressed
func (*DataPageV2) NumNulls ¶
func (d *DataPageV2) NumNulls() int32
NumNulls is the reported number of nulls in this datapage
func (*DataPageV2) Release ¶
func (d *DataPageV2) Release()
Release this page back into the DataPage object pool so that it can be reused.
After calling this function, the object should not be utilized anymore, otherwise conflicts can arise.
func (*DataPageV2) RepetitionLevelByteLen ¶
func (d *DataPageV2) RepetitionLevelByteLen() int32
RepetitionLevelByteLen is the number of bytes in the buffer which are used to represent the repetition levels
func (*DataPageV2) Statistics ¶
func (d *DataPageV2) Statistics() metadata.EncodedStatistics
Statistics are the encoded statistics in the data page
func (*DataPageV2) UncompressedSize ¶
func (d *DataPageV2) UncompressedSize() int64
UncompressedSize is the size of the raw page when uncompressed. If `IsCompressed` is true, then the raw data in the buffer is expected to be compressed.
type DictionaryPage ¶
type DictionaryPage struct {
// contains filtered or unexported fields
}
DictionaryPage represents a page of data that uses dictionary encoding
func NewDictionaryPage ¶
func NewDictionaryPage(buffer *memory.Buffer, nvals int32, encoding parquet.Encoding) *DictionaryPage
NewDictionaryPage constructs a new dictionary page with the provided data buffer and number of values.
func (*DictionaryPage) IsSorted ¶
func (d *DictionaryPage) IsSorted() bool
IsSorted returns whether the dictionary itself is sorted
func (*DictionaryPage) Release ¶
func (d *DictionaryPage) Release()
Release this page back into the DataPage object pool so that it can be reused.
After calling this function, the object should not be utilized anymore, otherwise conflicts can arise.
type FixedLenByteArrayColumnChunkReader ¶
type FixedLenByteArrayColumnChunkReader struct {
// contains filtered or unexported fields
}
FixedLenByteArrayColumnChunkReader is the Typed Column chunk reader instance for reading FixedLenByteArray column data.
func (*FixedLenByteArrayColumnChunkReader) Descriptor ¶
func (*FixedLenByteArrayColumnChunkReader) Err ¶
func (c *FixedLenByteArrayColumnChunkReader) Err() error
func (*FixedLenByteArrayColumnChunkReader) HasNext ¶
func (c *FixedLenByteArrayColumnChunkReader) HasNext() bool
HasNext returns whether there is more data to be read in this column and row group.
func (*FixedLenByteArrayColumnChunkReader) ReadBatch ¶
func (cr *FixedLenByteArrayColumnChunkReader) ReadBatch(batchSize int64, values []parquet.FixedLenByteArray, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
ReadBatch reads batchSize values from the column.
Returns error if values is not at least big enough to hold the number of values that will be read.
defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.
total is the number of rows that were read, valuesRead is the actual number of physical values that were read excluding nulls
type Float32ColumnChunkReader ¶
type Float32ColumnChunkReader struct {
// contains filtered or unexported fields
}
Float32ColumnChunkReader is the Typed Column chunk reader instance for reading Float32 column data.
func (*Float32ColumnChunkReader) Descriptor ¶
func (*Float32ColumnChunkReader) HasNext ¶
func (c *Float32ColumnChunkReader) HasNext() bool
HasNext returns whether there is more data to be read in this column and row group.
func (*Float32ColumnChunkReader) ReadBatch ¶
func (cr *Float32ColumnChunkReader) ReadBatch(batchSize int64, values []float32, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
ReadBatch reads batchSize values from the column.
Returns error if values is not at least big enough to hold the number of values that will be read.
defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.
total is the number of rows that were read, valuesRead is the actual number of physical values that were read excluding nulls
type Float64ColumnChunkReader ¶
type Float64ColumnChunkReader struct {
// contains filtered or unexported fields
}
Float64ColumnChunkReader is the Typed Column chunk reader instance for reading Float64 column data.
func (*Float64ColumnChunkReader) Descriptor ¶
func (*Float64ColumnChunkReader) HasNext ¶
func (c *Float64ColumnChunkReader) HasNext() bool
HasNext returns whether there is more data to be read in this column and row group.
func (*Float64ColumnChunkReader) ReadBatch ¶
func (cr *Float64ColumnChunkReader) ReadBatch(batchSize int64, values []float64, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
ReadBatch reads batchSize values from the column.
Returns error if values is not at least big enough to hold the number of values that will be read.
defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.
total is the number of rows that were read, valuesRead is the actual number of physical values that were read excluding nulls
type Int32ColumnChunkReader ¶
type Int32ColumnChunkReader struct {
// contains filtered or unexported fields
}
Int32ColumnChunkReader is the Typed Column chunk reader instance for reading Int32 column data.
func (*Int32ColumnChunkReader) Descriptor ¶
func (*Int32ColumnChunkReader) HasNext ¶
func (c *Int32ColumnChunkReader) HasNext() bool
HasNext returns whether there is more data to be read in this column and row group.
func (*Int32ColumnChunkReader) ReadBatch ¶
func (cr *Int32ColumnChunkReader) ReadBatch(batchSize int64, values []int32, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
ReadBatch reads batchSize values from the column.
Returns error if values is not at least big enough to hold the number of values that will be read.
defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.
total is the number of rows that were read, valuesRead is the actual number of physical values that were read excluding nulls
type Int64ColumnChunkReader ¶
type Int64ColumnChunkReader struct {
// contains filtered or unexported fields
}
Int64ColumnChunkReader is the Typed Column chunk reader instance for reading Int64 column data.
func (*Int64ColumnChunkReader) Descriptor ¶
func (*Int64ColumnChunkReader) HasNext ¶
func (c *Int64ColumnChunkReader) HasNext() bool
HasNext returns whether there is more data to be read in this column and row group.
func (*Int64ColumnChunkReader) ReadBatch ¶
func (cr *Int64ColumnChunkReader) ReadBatch(batchSize int64, values []int64, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
ReadBatch reads batchSize values from the column.
Returns error if values is not at least big enough to hold the number of values that will be read.
defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.
total is the number of rows that were read, valuesRead is the actual number of physical values that were read excluding nulls
type Int96ColumnChunkReader ¶
type Int96ColumnChunkReader struct {
// contains filtered or unexported fields
}
Int96ColumnChunkReader is the Typed Column chunk reader instance for reading Int96 column data.
func (*Int96ColumnChunkReader) Descriptor ¶
func (*Int96ColumnChunkReader) HasNext ¶
func (c *Int96ColumnChunkReader) HasNext() bool
HasNext returns whether there is more data to be read in this column and row group.
func (*Int96ColumnChunkReader) ReadBatch ¶
func (cr *Int96ColumnChunkReader) ReadBatch(batchSize int64, values []parquet.Int96, defLvls, repLvls []int16) (total int64, valuesRead int, err error)
ReadBatch reads batchSize values from the column.
Returns error if values is not at least big enough to hold the number of values that will be read.
defLvls and repLvls can be nil, or will be populated if not nil. If not nil, they must be at least large enough to hold the number of values that will be read.
total is the number of rows that were read, valuesRead is the actual number of physical values that were read excluding nulls
type LevelInfo ¶
type LevelInfo struct { // How many slots an undefined but present (i.e. null) element in // parquet consumes when decoding to Arrow. // "Slot" is used in the same context as the Arrow specification // (i.e. a value holder). // This is only ever >1 for descendants of FixedSizeList. NullSlotUsage int32 // The definition level at which the value for the field // is considered not null (definition levels greater than // or equal to this value indicate a not-null // value for the field). For list fields definition levels // greater than or equal to this field indicate a present, // possibly null, child value. DefLevel int16 // The repetition level corresponding to this element // or the closest repeated ancestor. Any repetition // level less than this indicates either a new list OR // an empty list (which is determined in conjunction // with definition levels). RepLevel int16 // The definition level indicating the level at which the closest // repeated ancestor is not empty. This is used to discriminate // between a value less than |def_level| being null or excluded entirely. // For instance if we have an arrow schema like: // list(struct(f0: int)). Then there are the following // definition levels: // 0 = null list // 1 = present but empty list. // 2 = a null value in the list // 3 = a non null struct but null integer. // 4 = a present integer. // When reconstructing, the struct and integer arrays' // repeated_ancestor_def_level would be 2. Any // def_level < 2 indicates that there isn't a corresponding // child value in the list. // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]] // has the def levels [0, 1, 2, 3, 4]. The actual // struct array is only of length 3: [not-set, set, set] and // the int array is also of length 3: [N/A, null, 1]. RepeatedAncestorDefLevel int16 }
func (*LevelInfo) HasNullableValues ¶
func (*LevelInfo) IncrementOptional ¶
func (l *LevelInfo) IncrementOptional()
func (*LevelInfo) IncrementRepeated ¶
type Page ¶
type Page interface { // Returns which kind of page this is Type() format.PageType // Get the raw bytes of this page Data() []byte // return the encoding used for this page, Plain/RLE, etc. Encoding() format.Encoding // get the number of values in this page NumValues() int32 // release this page object back into the page pool for re-use Release() }
Page is an interface for handling DataPages or Dictionary Pages
type PageReader ¶
type PageReader interface { // Set the maximum Page header size allowed to be read SetMaxPageHeaderSize(int) // Return the current page, or nil if there are no more Page() Page // Fetch the next page, returns false if there are no more pages Next() bool // if Next returns false, Err will return the error encountered or // nil if there was no error and you just hit the end of the page Err() error // Reset allows reusing a page reader Reset(r parquet.ReaderAtSeeker, nrows int64, compressType compress.Compression, ctx *CryptoContext) }
PageReader is the interface used by the column reader in order to read and handle DataPages and loop through them.
func NewPageReader ¶
func NewPageReader(r parquet.ReaderAtSeeker, nrows int64, compressType compress.Compression, mem memory.Allocator, ctx *CryptoContext) (PageReader, error)
NewPageReader returns a page reader for the data which can be read from the provided reader and compression.
type ReadOption ¶
type ReadOption func(*Reader)
func WithMetadata ¶
func WithMetadata(m *metadata.FileMetaData) ReadOption
WithMetadata allows providing a specific FileMetaData object rather than reading the file metadata from the file itself.
func WithReadProps ¶
func WithReadProps(props *parquet.ReaderProperties) ReadOption
WithReadProps specifies a specific reader properties instance to use, rather than using the default ReaderProperties.
type Reader ¶
type Reader struct {
// contains filtered or unexported fields
}
Reader is the main interface for reading a parquet file
func NewParquetReader ¶
func NewParquetReader(r parquet.ReaderAtSeeker, opts ...ReadOption) (*Reader, error)
NewParquetReader returns a FileReader instance that reads a parquet file which can be read from r. This reader needs to support Read, ReadAt and Seeking.
If no read properties are provided then the default ReaderProperties will be used. The WithMetadata option can be used to provide a FileMetaData object rather than reading the file metadata from the file.
func OpenParquetFile ¶
func OpenParquetFile(filename string, memoryMap bool, opts ...ReadOption) (*Reader, error)
OpenParquetFile will return a Reader for the given parquet file on the local file system.
Optionally the file can be memory mapped for faster reading. If no read properties are provided then the default ReaderProperties will be used. The WithMetadata option can be used to provide a FileMetaData object rather than reading the file metadata from the file.
func (*Reader) Close ¶
Close will close the current reader, and if the underlying reader being used is an `io.Closer` then Close will be called on it too.
func (*Reader) MetaData ¶
func (f *Reader) MetaData() *metadata.FileMetaData
MetaData returns the underlying FileMetadata object
func (*Reader) NumRowGroups ¶
NumRowGroups returns the total number of row groups in this file.
func (*Reader) RowGroup ¶
func (f *Reader) RowGroup(i int) *RowGroupReader
RowGroup returns a reader for the desired (0-based) row group
func (*Reader) WriterVersion ¶
func (f *Reader) WriterVersion() *metadata.AppVersion
WriterVersion returns the Application Version that was written in the file metadata
type RowGroupReader ¶
type RowGroupReader struct {
// contains filtered or unexported fields
}
RowGroupReader is the primary interface for reading a single row group
func (*RowGroupReader) ByteSize ¶
func (r *RowGroupReader) ByteSize() int64
ByteSize returns the full byte size of this row group as defined in its metadata
func (*RowGroupReader) Column ¶
func (r *RowGroupReader) Column(i int) ColumnChunkReader
Column returns a column reader for the requested (0-indexed) column
panics if passed a column not in the range [0, NumColumns)
func (*RowGroupReader) GetColumnPageReader ¶
func (r *RowGroupReader) GetColumnPageReader(i int) (PageReader, error)
func (*RowGroupReader) MetaData ¶
func (r *RowGroupReader) MetaData() *metadata.RowGroupMetaData
MetaData returns the metadata of the current Row Group
func (*RowGroupReader) NumColumns ¶
func (r *RowGroupReader) NumColumns() int
NumColumns returns the number of columns of data as defined in the metadata of this row group
func (*RowGroupReader) NumRows ¶
func (r *RowGroupReader) NumRows() int64
NumRows returns the number of rows in just this row group
type ValidityBitmapInputOutput ¶
type ValidityBitmapInputOutput struct { // Input only. // The maximum number of values_read expected (actual // values read must be less than or equal to this value). // If this number is exceeded methods will throw a // ParquetException. Exceeding this limit indicates // either a corrupt or incorrectly written file. ReadUpperBound int64 // Output only. The number of values encountered // (this is logically the count of the number of elements // for an Arrow array). Read int64 // Input/Output. The number of nulls encountered. NullCount int64 // Output only. The validity bitmap to populate. May be null only // for DefRepLevelsToListInfo (if all that is needed is list offsets). ValidBits []byte // Input only, offset into valid_bits to start at. ValidBitsOffset int64 }
Input/Output structure for reconstructed validity bitmaps.