"use strict";
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
const codec_1 = require("./codec");
|
|
const Compression = require("./compression");
|
|
const schema_1 = require("./schema");
|
|
const Shred = require("./shred");
|
|
// tslint:disable-next-line:max-line-length
|
|
const thrift_1 = require("./thrift");
|
|
const Util = require("./util");
|
|
// import Fs = require('fs');
|
|
/**
|
|
* Parquet File Magic String
|
|
*/
|
|
const PARQUET_MAGIC = 'PAR1';
|
|
/**
|
|
* Parquet File Format Version
|
|
*/
|
|
const PARQUET_VERSION = 1;
|
|
/**
|
|
* Internal type used for repetition/definition levels
|
|
*/
|
|
const PARQUET_RDLVL_TYPE = 'INT32';
|
|
const PARQUET_RDLVL_ENCODING = 'RLE';
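/*
 * Background note (not from the original source): repetition and definition
 * levels are Parquet's Dremel-style encoding for nested and optional data.
 * For each value slot, the definition level says how many optional/repeated
 * ancestors are defined (a value is non-null only when it equals the column's
 * dLevelMax), and the repetition level says at which repeated ancestor a new
 * list entry starts. For a flat optional column, dLevelMax is 1, so dlevels
 * of [1, 0, 1] decode to [value, null, value].
 */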
/**
 * A parquet cursor is used to retrieve rows from a parquet file in order
 */
class ParquetCursor {
    /**
     * Create a new parquet cursor from the file metadata and an envelope reader.
     * It is usually not recommended to call this constructor directly except for
     * advanced and internal use cases. Consider using getCursor() on the
     * ParquetReader instead
     */
    constructor(metadata, envelopeReader, schema, columnList) {
        this.metadata = metadata;
        this.envelopeReader = envelopeReader;
        this.schema = schema;
        this.columnList = columnList;
        this.rowGroup = [];
        this.rowGroupIndex = 0;
    }
    /**
     * Retrieve the next row from the cursor. Returns a row or NULL if the end
     * of the file was reached
     */
    async next() {
        if (this.rowGroup.length === 0) {
            if (this.rowGroupIndex >= this.metadata.row_groups.length) {
                return null;
            }
            // Load and materialize the next row group, then serve rows from it
            const rowBuffer = await this.envelopeReader.readRowGroup(this.schema, this.metadata.row_groups[this.rowGroupIndex], this.columnList);
            this.rowGroup = Shred.materializeRecords(this.schema, rowBuffer);
            this.rowGroupIndex++;
        }
        return this.rowGroup.shift();
    }
    /**
     * Rewind the cursor to the beginning of the file
     */
    rewind() {
        this.rowGroup = [];
        this.rowGroupIndex = 0;
    }
    /**
     * Implement AsyncIterable
     */
    // tslint:disable-next-line:function-name
    [Symbol.asyncIterator]() {
        let done = false;
        return {
            next: async () => {
                if (done) {
                    return { done, value: null };
                }
                const value = await this.next();
                if (value === null) {
                    return { done: true, value };
                }
                return { done: false, value };
            },
            return: async () => {
                done = true;
                return { done, value: null };
            },
            throw: async () => {
                done = true;
                return { done: true, value: null };
            }
        };
    }
}
exports.ParquetCursor = ParquetCursor;
/**
 * A parquet reader allows retrieving the rows from a parquet file in order.
 * The basic usage is to create a reader and then retrieve a cursor/iterator
 * which allows you to consume row after row until all rows have been read. It is
 * important that you call close() after you are finished reading the file to
 * avoid leaking file descriptors.
 */
class ParquetReader {
    /**
     * Create a new parquet reader from the file metadata and an envelope reader.
     * It is not recommended to call this constructor directly except for advanced
     * and internal use cases. Consider using one of the open{File,Buffer} methods
     * instead
     */
    constructor(metadata, envelopeReader) {
        if (metadata.version !== PARQUET_VERSION) {
            throw new Error('invalid parquet version');
        }
        this.metadata = metadata;
        this.envelopeReader = envelopeReader;
        // Element 0 of the footer schema list is the root; its num_children
        // descendants follow in depth-first order
        const root = this.metadata.schema[0];
        const { schema } = decodeSchema(this.metadata.schema, 1, root.num_children);
        this.schema = new schema_1.ParquetSchema(schema);
    }
    /**
     * Open the parquet file pointed to by the specified path and return a new
     * parquet reader
     */
    static async openFile(filePath) {
        const envelopeReader = await ParquetEnvelopeReader.openFile(filePath);
        try {
            await envelopeReader.readHeader();
            const metadata = await envelopeReader.readFooter();
            return new ParquetReader(metadata, envelopeReader);
        }
        catch (err) {
            await envelopeReader.close();
            throw err;
        }
    }
    /**
     * Open the parquet file contained in the specified buffer and return a new
     * parquet reader
     */
    static async openBuffer(buffer) {
        const envelopeReader = await ParquetEnvelopeReader.openBuffer(buffer);
        try {
            await envelopeReader.readHeader();
            const metadata = await envelopeReader.readFooter();
            return new ParquetReader(metadata, envelopeReader);
        }
        catch (err) {
            await envelopeReader.close();
            throw err;
        }
    }
    /**
     * Return a cursor over the rows of this file. An empty or missing
     * columnList selects all columns; entries may be plain column names or
     * path arrays for nested fields.
     */
    getCursor(columnList) {
        if (!columnList) {
            // tslint:disable-next-line:no-parameter-reassignment
            columnList = [];
        }
        // Normalize plain column names to single-element path arrays
        // tslint:disable-next-line:no-parameter-reassignment
        columnList = columnList.map(x => Array.isArray(x) ? x : [x]);
        return new ParquetCursor(this.metadata, this.envelopeReader, this.schema, columnList);
    }
    /**
     * Return the number of rows in this file. Note that the number of rows is
     * not necessarily equal to the number of rows in each column.
     */
    getRowCount() {
        return +this.metadata.num_rows;
    }
    /**
     * Returns the ParquetSchema for this file
     */
    getSchema() {
        return this.schema;
    }
    /**
     * Returns the user (key/value) metadata for this file
     */
    getMetadata() {
        const md = {};
        for (const kv of this.metadata.key_value_metadata) {
            md[kv.key] = kv.value;
        }
        return md;
    }
    /**
     * Close this parquet reader. You MUST call this method once you're finished
     * reading rows
     */
    async close() {
        await this.envelopeReader.close();
        this.envelopeReader = null;
        this.metadata = null;
    }
    /**
     * Implement AsyncIterable
     */
    // tslint:disable-next-line:function-name
    [Symbol.asyncIterator]() {
        return this.getCursor()[Symbol.asyncIterator]();
    }
}
exports.ParquetReader = ParquetReader;
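/*
 * Illustrative usage sketch (not part of this module; assumes a file
 * 'fruits.parquet' with a `name` column exists):
 *
 *   const reader = await ParquetReader.openFile('fruits.parquet');
 *   try {
 *     // the reader is async-iterable, so a cursor is created implicitly
 *     for await (const row of reader) {
 *       console.log(row.name);
 *     }
 *   } finally {
 *     await reader.close(); // required to release the file descriptor
 *   }
 */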
/**
 * The parquet envelope reader allows direct, unbuffered access to the individual
 * sections of the parquet file, namely the header, footer and the row groups.
 * This class is intended for advanced/internal users; if you just want to retrieve
 * rows from a parquet file use the ParquetReader instead
 */
class ParquetEnvelopeReader {
    constructor(read, close, fileSize) {
        this.read = read;
        this.close = close;
        this.fileSize = fileSize;
    }
    static async openFile(filePath) {
        const fileStat = await Util.fstat(filePath);
        const fileDescriptor = await Util.fopen(filePath);
        const readFn = Util.fread.bind(undefined, fileDescriptor);
        const closeFn = Util.fclose.bind(undefined, fileDescriptor);
        return new ParquetEnvelopeReader(readFn, closeFn, fileStat.size);
    }
    static async openBuffer(buffer) {
        const readFn = (position, length) => Promise.resolve(buffer.slice(position, position + length));
        const closeFn = () => Promise.resolve();
        return new ParquetEnvelopeReader(readFn, closeFn, buffer.length);
    }
    async readHeader() {
        const buf = await this.read(0, PARQUET_MAGIC.length);
        if (buf.toString() !== PARQUET_MAGIC) {
            throw new Error('not a valid parquet file');
        }
    }
    async readRowGroup(schema, rowGroup, columnList) {
        const buffer = {
            rowCount: +rowGroup.num_rows,
            columnData: {}
        };
        for (const colChunk of rowGroup.columns) {
            const colMetadata = colChunk.meta_data;
            const colKey = colMetadata.path_in_schema;
            if (columnList.length > 0 && Util.fieldIndexOf(columnList, colKey) < 0) {
                continue;
            }
            buffer.columnData[colKey.join()] = await this.readColumnChunk(schema, colChunk);
        }
        return buffer;
    }
    async readColumnChunk(schema, colChunk) {
        if (colChunk.file_path !== undefined && colChunk.file_path !== null) {
            throw new Error('external references are not supported');
        }
        const field = schema.findField(colChunk.meta_data.path_in_schema);
        const type = Util.getThriftEnum(thrift_1.Type, colChunk.meta_data.type);
        if (type !== field.primitiveType) {
            throw new Error('chunk type not matching schema: ' + type);
        }
        const compression = Util.getThriftEnum(thrift_1.CompressionCodec, colChunk.meta_data.codec);
        const pagesOffset = +colChunk.meta_data.data_page_offset;
        const pagesSize = +colChunk.meta_data.total_compressed_size;
        const pagesBuf = await this.read(pagesOffset, pagesSize);
        return decodeDataPages(pagesBuf, field, compression);
    }
    async readFooter() {
        // The trailer is a 4-byte little-endian metadata length followed by
        // the magic string
        const trailerLen = PARQUET_MAGIC.length + 4;
        const trailerBuf = await this.read(this.fileSize - trailerLen, trailerLen);
        if (trailerBuf.slice(4).toString() !== PARQUET_MAGIC) {
            throw new Error('not a valid parquet file');
        }
        const metadataSize = trailerBuf.readUInt32LE(0);
        const metadataOffset = this.fileSize - metadataSize - trailerLen;
        if (metadataOffset < PARQUET_MAGIC.length) {
            throw new Error('invalid metadata size');
        }
        const metadataBuf = await this.read(metadataOffset, metadataSize);
        // let metadata = new parquet_thrift.FileMetaData();
        // parquet_util.decodeThrift(metadata, metadataBuf);
        const { metadata } = Util.decodeFileMetadata(metadataBuf);
        return metadata;
    }
}
exports.ParquetEnvelopeReader = ParquetEnvelopeReader;
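/*
 * Overall file layout assumed by the reader above (per the Parquet format
 * specification, summarized here for orientation):
 *
 *   'PAR1' | row group 0 | ... | row group N
 *        | footer (thrift FileMetaData) | 4-byte LE footer length | 'PAR1'
 *
 * readHeader() checks the leading magic, readFooter() walks backwards from
 * the end of the file, and readRowGroup() uses the column chunk offsets
 * recorded in the footer metadata.
 */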
/**
 * Decode a consecutive array of data using one of the parquet encodings
 */
function decodeValues(type, encoding, cursor, count, opts) {
    if (!(encoding in codec_1.PARQUET_CODEC)) {
        throw new Error(`invalid encoding: ${encoding}`);
    }
    return codec_1.PARQUET_CODEC[encoding].decodeValues(type, cursor, count, opts);
}
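/*
 * Background note: a column chunk is stored as a back-to-back sequence of
 * pages, each consisting of a thrift-encoded PageHeader followed by the
 * (possibly compressed) page body. decodeDataPages() below walks that
 * sequence and concatenates the levels and values of every data page.
 */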
function decodeDataPages(buffer, column, compression) {
    const cursor = {
        buffer,
        offset: 0,
        size: buffer.length
    };
    const data = {
        rlevels: [],
        dlevels: [],
        values: [],
        count: 0
    };
    while (cursor.offset < cursor.size) {
        // const pageHeader = new parquet_thrift.PageHeader();
        // cursor.offset += parquet_util.decodeThrift(pageHeader, cursor.buffer);
        // Decode the header at the current offset; the slice assumes
        // Util.decodePageHeader reads from the start of the buffer it is
        // given (without it, every iteration would re-read the first header)
        const { pageHeader, length } = Util.decodePageHeader(cursor.buffer.slice(cursor.offset));
        cursor.offset += length;
        const pageType = Util.getThriftEnum(thrift_1.PageType, pageHeader.type);
        let pageData = null;
        switch (pageType) {
            case 'DATA_PAGE':
                pageData = decodeDataPage(cursor, pageHeader, column, compression);
                break;
            case 'DATA_PAGE_V2':
                pageData = decodeDataPageV2(cursor, pageHeader, column, compression);
                break;
            default:
                throw new Error(`invalid page type: ${pageType}`);
        }
        Array.prototype.push.apply(data.rlevels, pageData.rlevels);
        Array.prototype.push.apply(data.dlevels, pageData.dlevels);
        Array.prototype.push.apply(data.values, pageData.values);
        data.count += pageData.count;
    }
    return data;
}
function decodeDataPage(cursor, header, column, compression) {
    const cursorEnd = cursor.offset + header.compressed_page_size;
    const valueCount = header.data_page_header.num_values;
    // const info = {
    //   path: opts.column.path.join('.'),
    //   valueEncoding,
    //   dLevelEncoding,
    //   rLevelEncoding,
    //   cursorOffset: cursor.offset,
    //   cursorEnd,
    //   cursorSize: cursor.size,
    //   header,
    //   opts,
    //   buffer: cursor.buffer.toJSON(),
    //   values: null as any[],
    //   valBuf: null as any
    // };
    // Fs.writeFileSync(`dump/${info.path}.ts.json`, JSON.stringify(info, null, 2));
    /* uncompress page */
    let dataCursor = cursor;
    if (compression !== 'UNCOMPRESSED') {
        const valuesBuf = Compression.inflate(compression, cursor.buffer.slice(cursor.offset, cursorEnd), header.uncompressed_page_size);
        dataCursor = {
            buffer: valuesBuf,
            offset: 0,
            size: valuesBuf.length
        };
        cursor.offset = cursorEnd;
    }
    /* read repetition levels */
    const rLevelEncoding = Util.getThriftEnum(thrift_1.Encoding, header.data_page_header.repetition_level_encoding);
    // tslint:disable-next-line:prefer-array-literal
    let rLevels = new Array(valueCount);
    if (column.rLevelMax > 0) {
        rLevels = decodeValues(PARQUET_RDLVL_TYPE, rLevelEncoding, dataCursor, valueCount, {
            bitWidth: Util.getBitWidth(column.rLevelMax),
            disableEnvelope: false
            // column: opts.column
        });
    }
    else {
        rLevels.fill(0);
    }
    /* read definition levels */
    const dLevelEncoding = Util.getThriftEnum(thrift_1.Encoding, header.data_page_header.definition_level_encoding);
    // tslint:disable-next-line:prefer-array-literal
    let dLevels = new Array(valueCount);
    if (column.dLevelMax > 0) {
        dLevels = decodeValues(PARQUET_RDLVL_TYPE, dLevelEncoding, dataCursor, valueCount, {
            bitWidth: Util.getBitWidth(column.dLevelMax),
            disableEnvelope: false
            // column: opts.column
        });
    }
    else {
        dLevels.fill(0);
    }
    // Only slots whose definition level reaches dLevelMax carry a stored value
    let valueCountNonNull = 0;
    for (const dlvl of dLevels) {
        if (dlvl === column.dLevelMax) {
            valueCountNonNull++;
        }
    }
    /* read values */
    const valueEncoding = Util.getThriftEnum(thrift_1.Encoding, header.data_page_header.encoding);
    const values = decodeValues(column.primitiveType, valueEncoding, dataCursor, valueCountNonNull, {
        typeLength: column.typeLength,
        bitWidth: column.typeLength
    });
    // info.valBuf = uncursor.buffer.toJSON();
    // info.values = values;
    // Fs.writeFileSync(`dump/${info.path}.ts.json`, JSON.stringify(info, null, 2));
    return {
        dlevels: dLevels,
        rlevels: rLevels,
        values,
        count: valueCount
    };
}
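/*
 * Data page V2 differs from V1 in two ways that matter below: repetition and
 * definition levels are always RLE-encoded without the 4-byte length envelope
 * (hence disableEnvelope: true), and they are stored uncompressed, so only
 * the values section is inflated when is_compressed is set.
 */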
function decodeDataPageV2(cursor, header, column, compression) {
    const cursorEnd = cursor.offset + header.compressed_page_size;
    const valueCount = header.data_page_header_v2.num_values;
    const valueCountNonNull = valueCount - header.data_page_header_v2.num_nulls;
    const valueEncoding = Util.getThriftEnum(thrift_1.Encoding, header.data_page_header_v2.encoding);
    /* read repetition levels */
    // tslint:disable-next-line:prefer-array-literal
    let rLevels = new Array(valueCount);
    if (column.rLevelMax > 0) {
        rLevels = decodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, cursor, valueCount, {
            bitWidth: Util.getBitWidth(column.rLevelMax),
            disableEnvelope: true
        });
    }
    else {
        rLevels.fill(0);
    }
    /* read definition levels */
    // tslint:disable-next-line:prefer-array-literal
    let dLevels = new Array(valueCount);
    if (column.dLevelMax > 0) {
        dLevels = decodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, cursor, valueCount, {
            bitWidth: Util.getBitWidth(column.dLevelMax),
            disableEnvelope: true
        });
    }
    else {
        dLevels.fill(0);
    }
    /* read values */
    let valuesBufCursor = cursor;
    if (header.data_page_header_v2.is_compressed) {
        const valuesBuf = Compression.inflate(compression, cursor.buffer.slice(cursor.offset, cursorEnd), header.uncompressed_page_size);
        valuesBufCursor = {
            buffer: valuesBuf,
            offset: 0,
            size: valuesBuf.length
        };
        cursor.offset = cursorEnd;
    }
    const values = decodeValues(column.primitiveType, valueEncoding, valuesBufCursor, valueCountNonNull, {
        typeLength: column.typeLength,
        bitWidth: column.typeLength
    });
    return {
        dlevels: dLevels,
        rlevels: rLevels,
        values,
        count: valueCount
    };
}
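/*
 * Hypothetical example of the flattened footer schema that decodeSchema()
 * below re-nests (element 0 is the root; children follow depth-first):
 *
 *   [ { name: 'root', num_children: 2 },
 *     { name: 'id',   type: INT64, repetition_type: REQUIRED },
 *     { name: 'tags', num_children: 1, repetition_type: OPTIONAL },
 *     { name: 'tag',  type: BYTE_ARRAY, repetition_type: REPEATED } ]
 *
 * decodeSchema(elements, 1, 2) would yield
 * { id: {...}, tags: { fields: { tag: {...} } } }.
 */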
function decodeSchema(schemaElements, offset, len) {
    const schema = {};
    let next = offset;
    for (let i = 0; i < len; i++) {
        const schemaElement = schemaElements[next];
        const repetitionType = next > 0 ? Util.getThriftEnum(thrift_1.FieldRepetitionType, schemaElement.repetition_type) : 'ROOT';
        let optional = false;
        let repeated = false;
        switch (repetitionType) {
            case 'REQUIRED':
                break;
            case 'OPTIONAL':
                optional = true;
                break;
            case 'REPEATED':
                repeated = true;
                break;
        }
        if (schemaElement.num_children > 0) {
            // Group node: recurse into its children, which immediately follow it
            const res = decodeSchema(schemaElements, next + 1, schemaElement.num_children);
            next = res.next;
            schema[schemaElement.name] = {
                // type: undefined,
                optional,
                repeated,
                fields: res.schema
            };
        }
        else {
            // Leaf node: prefer the converted (logical) type when present
            let logicalType = Util.getThriftEnum(thrift_1.Type, schemaElement.type);
            if (schemaElement.converted_type != null) {
                logicalType = Util.getThriftEnum(thrift_1.ConvertedType, schemaElement.converted_type);
            }
            schema[schemaElement.name] = {
                type: logicalType,
                typeLength: schemaElement.type_length,
                optional,
                repeated
            };
            next++;
        }
    }
    return { schema, offset, next };
}
//# sourceMappingURL=reader.js.map