"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const codec_1 = require("./codec");
const Compression = require("./compression");
const schema_1 = require("./schema");
const Shred = require("./shred");
const thrift_1 = require("./thrift");
const Util = require("./util");

/**
 * Parquet File Magic String
 */
const PARQUET_MAGIC = 'PAR1';

/**
 * Parquet File Format Version
 */
const PARQUET_VERSION = 1;

/**
 * Internal type used for repetition/definition levels
 */
const PARQUET_RDLVL_TYPE = 'INT32';

/**
 * Encoding name used for repetition/definition levels in v2 data pages
 */
const PARQUET_RDLVL_ENCODING = 'RLE';

/**
 * A parquet cursor is used to retrieve rows from a parquet file in order
 */
class ParquetCursor {
    /**
     * Create a new parquet cursor from the file metadata and an envelope reader.
     * It is usually not recommended to call this constructor directly except for
     * advanced and internal use cases. Consider using getCursor() on the
     * ParquetReader instead.
     *
     * @param metadata file metadata; `row_groups` is read from it
     * @param envelopeReader reader used to fetch row groups
     * @param schema schema handed to the row group reader and record materializer
     * @param columnList column paths to read; an empty list selects all columns
     */
    constructor(metadata, envelopeReader, schema, columnList) {
        this.metadata = metadata;
        this.envelopeReader = envelopeReader;
        this.schema = schema;
        this.columnList = columnList;
        this.rowGroup = [];
        this.rowGroupIndex = 0;
    }

    /**
     * Retrieve the next row from the cursor.
     *
     * @returns a row, or null if the end of the file was reached
     */
    async next() {
        // Loop (rather than a single `if`) so that a row group which
        // materializes to zero records is skipped instead of surfacing
        // `undefined` as if it were a row.
        while (this.rowGroup.length === 0) {
            if (this.rowGroupIndex >= this.metadata.row_groups.length) {
                return null;
            }
            const rowBuffer = await this.envelopeReader.readRowGroup(
                this.schema,
                this.metadata.row_groups[this.rowGroupIndex],
                this.columnList
            );
            this.rowGroup = Shred.materializeRecords(this.schema, rowBuffer);
            this.rowGroupIndex++;
        }
        return this.rowGroup.shift();
    }

    /**
     * Rewind the cursor to the beginning of the file
     */
    rewind() {
        this.rowGroup = [];
        this.rowGroupIndex = 0;
    }

    /**
     * Implement AsyncIterable. Iteration ends when next() yields null or
     * after the consumer calls return()/throw() on the iterator.
     */
    [Symbol.asyncIterator]() {
        let done = false;
        return {
            next: async () => {
                if (done) {
                    return { done, value: null };
                }
                const value = await this.next();
                if (value === null) {
                    return { done: true, value };
                }
                return { done: false, value };
            },
            return: async () => {
                done = true;
                return { done, value: null };
            },
            throw: async () => {
                done = true;
                return { done: true, value: null };
            }
        };
    }
}
exports.ParquetCursor = ParquetCursor;

/**
 * Read header and footer through the given envelope reader and wrap the
 * result in a ParquetReader. The envelope reader is closed before the error
 * is rethrown if any step fails.
 */
async function createReader(envelopeReader) {
    try {
        await envelopeReader.readHeader();
        const metadata = await envelopeReader.readFooter();
        return new ParquetReader(metadata, envelopeReader);
    }
    catch (err) {
        await envelopeReader.close();
        throw err;
    }
}

/**
 * A parquet reader allows retrieving the rows from a parquet file in order.
 * The basic usage is to create a reader and then retrieve a cursor/iterator
 * which allows you to consume row after row until all rows have been read. It is
 * important that you call close() after you are finished reading the file to
 * avoid leaking file descriptors.
 */
class ParquetReader {
    /**
     * Create a new parquet reader from the file metadata and an envelope reader.
     * It is not recommended to call this constructor directly except for advanced
     * and internal use cases. Consider using one of the open{File,Buffer} methods
     * instead.
     *
     * @throws Error if `metadata.version` is not the supported version
     */
    constructor(metadata, envelopeReader) {
        if (metadata.version !== PARQUET_VERSION) {
            throw new Error('invalid parquet version');
        }
        this.metadata = metadata;
        this.envelopeReader = envelopeReader;
        // The first schema element is the root; its children start at index 1.
        const root = this.metadata.schema[0];
        const { schema } = decodeSchema(this.metadata.schema, 1, root.num_children);
        this.schema = new schema_1.ParquetSchema(schema);
    }

    /**
     * Open the parquet file pointed to by the specified path and return a new
     * parquet reader
     */
    static async openFile(filePath) {
        const envelopeReader = await ParquetEnvelopeReader.openFile(filePath);
        return createReader(envelopeReader);
    }

    /**
     * Open a parquet file held entirely in the given buffer and return a new
     * parquet reader
     */
    static async openBuffer(buffer) {
        const envelopeReader = await ParquetEnvelopeReader.openBuffer(buffer);
        return createReader(envelopeReader);
    }

    /**
     * Return a cursor over the rows of the file.
     *
     * @param columnList optional list of columns to read; each entry is either
     *   a path array or a single value, which is wrapped into a one-element
     *   path. When omitted or empty, all columns are read.
     */
    getCursor(columnList) {
        const paths = (columnList || []).map(x => Array.isArray(x) ? x : [x]);
        return new ParquetCursor(this.metadata, this.envelopeReader, this.schema, paths);
    }

    /**
     * Return the number of rows in this file. Note that the number of rows is
     * not necessarily equal to the number of rows in each column.
     */
    getRowCount() {
        return +this.metadata.num_rows;
    }

    /**
     * Returns the ParquetSchema for this file
     */
    getSchema() {
        return this.schema;
    }

    /**
     * Returns the user (key/value) metadata for this file. A file without any
     * key/value metadata yields an empty object.
     */
    getMetadata() {
        const md = {};
        // key_value_metadata may be absent from the footer; treat as empty.
        for (const kv of this.metadata.key_value_metadata || []) {
            md[kv.key] = kv.value;
        }
        return md;
    }

    /**
     * Close this parquet reader. You MUST call this method once you're finished
     * reading rows
     */
    async close() {
        await this.envelopeReader.close();
        this.envelopeReader = null;
        this.metadata = null;
    }

    /**
     * Implement AsyncIterable by iterating a cursor over all columns
     */
    [Symbol.asyncIterator]() {
        return this.getCursor()[Symbol.asyncIterator]();
    }
}
exports.ParquetReader = ParquetReader;

/**
 * The parquet envelope reader allows direct, unbuffered access to the individual
 * sections of the parquet file, namely the header, footer and the row groups.
 * This class is intended for advanced/internal users; if you just want to retrieve
 * rows from a parquet file use the ParquetReader instead
 */
class ParquetEnvelopeReader {
    /**
     * @param read function `(position, length)` resolving to the bytes read
     * @param close function releasing the underlying resource
     * @param fileSize total size of the file in bytes
     */
    constructor(read, close, fileSize) {
        this.read = read;
        this.close = close;
        this.fileSize = fileSize;
    }

    /**
     * Create an envelope reader backed by the file at `filePath`
     */
    static async openFile(filePath) {
        const fileStat = await Util.fstat(filePath);
        const fileDescriptor = await Util.fopen(filePath);
        const readFn = Util.fread.bind(undefined, fileDescriptor);
        const closeFn = Util.fclose.bind(undefined, fileDescriptor);
        return new ParquetEnvelopeReader(readFn, closeFn, fileStat.size);
    }

    /**
     * Create an envelope reader backed by an in-memory buffer; closing it is
     * a no-op
     */
    static async openBuffer(buffer) {
        const readFn = (position, length) => Promise.resolve(buffer.slice(position, position + length));
        const closeFn = () => Promise.resolve();
        return new ParquetEnvelopeReader(readFn, closeFn, buffer.length);
    }

    /**
     * Verify that the file starts with the parquet magic string.
     *
     * @throws Error if the magic string does not match
     */
    async readHeader() {
        const buf = await this.read(0, PARQUET_MAGIC.length);
        if (buf.toString() !== PARQUET_MAGIC) {
            throw new Error('not valid parquet file');
        }
    }

    /**
     * Read the selected column chunks of one row group.
     *
     * @param schema schema used to look up each column's field
     * @param rowGroup row group metadata (`num_rows`, `columns`)
     * @param columnList column paths to read; an empty list selects all
     * @returns `{ rowCount, columnData }` where `columnData` is keyed by the
     *   comma-joined column path
     */
    async readRowGroup(schema, rowGroup, columnList) {
        const buffer = {
            rowCount: +rowGroup.num_rows,
            columnData: {}
        };
        for (const colChunk of rowGroup.columns) {
            const colKey = colChunk.meta_data.path_in_schema;
            const isSelected = columnList.length === 0 || Util.fieldIndexOf(columnList, colKey) >= 0;
            if (!isSelected) {
                continue;
            }
            buffer.columnData[colKey.join()] = await this.readColumnChunk(schema, colChunk);
        }
        return buffer;
    }

    /**
     * Read and decode all data pages of a single column chunk.
     *
     * @throws Error if the chunk lives in an external file or its type does
     *   not match the schema field's primitive type
     */
    async readColumnChunk(schema, colChunk) {
        if (colChunk.file_path !== undefined && colChunk.file_path !== null) {
            throw new Error('external references are not supported');
        }
        const field = schema.findField(colChunk.meta_data.path_in_schema);
        const type = Util.getThriftEnum(thrift_1.Type, colChunk.meta_data.type);
        if (type !== field.primitiveType) {
            throw new Error(`chunk type not matching schema: ${type}`);
        }
        const compression = Util.getThriftEnum(thrift_1.CompressionCodec, colChunk.meta_data.codec);
        const pagesOffset = +colChunk.meta_data.data_page_offset;
        const pagesSize = +colChunk.meta_data.total_compressed_size;
        const pagesBuf = await this.read(pagesOffset, pagesSize);
        return decodeDataPages(pagesBuf, field, compression);
    }

    /**
     * Read and decode the file metadata stored in the footer.
     *
     * The trailer is a 4-byte little-endian metadata length followed by the
     * magic string; the metadata itself sits directly before the trailer.
     *
     * @throws Error if the trailing magic string is missing or the declared
     *   metadata size would overlap the header
     */
    async readFooter() {
        const trailerLen = PARQUET_MAGIC.length + 4;
        const trailerBuf = await this.read(this.fileSize - trailerLen, trailerLen);
        if (trailerBuf.slice(4).toString() !== PARQUET_MAGIC) {
            throw new Error('not a valid parquet file');
        }
        const metadataSize = trailerBuf.readUInt32LE(0);
        const metadataOffset = this.fileSize - metadataSize - trailerLen;
        if (metadataOffset < PARQUET_MAGIC.length) {
            throw new Error('invalid metadata size');
        }
        const metadataBuf = await this.read(metadataOffset, metadataSize);
        const { metadata } = Util.decodeFileMetadata(metadataBuf);
        return metadata;
    }
}
exports.ParquetEnvelopeReader = ParquetEnvelopeReader;

/**
 * Decode a consecutive array of data using one of the parquet encodings.
 *
 * @throws Error if `encoding` is not a key of the codec table
 */
function decodeValues(type, encoding, cursor, count, opts) {
    if (!(encoding in codec_1.PARQUET_CODEC)) {
        throw new Error(`invalid encoding: ${encoding}`);
    }
    return codec_1.PARQUET_CODEC[encoding].decodeValues(type, cursor, count, opts);
}

/**
 * Append every element of `source` to `target` in place.
 *
 * Done element by element because spreading a very large array into a
 * single push()/apply() call can exceed the engine's argument limit.
 */
function appendAll(target, source) {
    for (const item of source) {
        target.push(item);
    }
}

/**
 * Decode every data page contained in `buffer` and concatenate the results.
 *
 * @param buffer bytes of one column chunk's pages
 * @param column schema field of the column being decoded
 * @param compression compression codec name of the column chunk
 * @returns `{ rlevels, dlevels, values, count }` accumulated over all pages
 * @throws Error on a page type other than DATA_PAGE / DATA_PAGE_V2
 */
function decodeDataPages(buffer, column, compression) {
    const cursor = {
        buffer,
        offset: 0,
        size: buffer.length
    };
    const data = {
        rlevels: [],
        dlevels: [],
        values: [],
        count: 0
    };
    while (cursor.offset < cursor.size) {
        // Decode the header at the current position; decoding from the start
        // of the buffer would re-read the first page's header for every
        // subsequent page in the chunk.
        const { pageHeader, length } = Util.decodePageHeader(cursor.buffer.slice(cursor.offset));
        cursor.offset += length;
        const pageType = Util.getThriftEnum(thrift_1.PageType, pageHeader.type);
        let pageData;
        switch (pageType) {
            case 'DATA_PAGE':
                pageData = decodeDataPage(cursor, pageHeader, column, compression);
                break;
            case 'DATA_PAGE_V2':
                pageData = decodeDataPageV2(cursor, pageHeader, column, compression);
                break;
            default:
                throw new Error(`invalid page type: ${pageType}`);
        }
        appendAll(data.rlevels, pageData.rlevels);
        appendAll(data.dlevels, pageData.dlevels);
        appendAll(data.values, pageData.values);
        data.count += pageData.count;
    }
    return data;
}

/**
 * Decode a v1 data page starting at `cursor.offset`.
 *
 * For a compressed column the whole page body is inflated first and the
 * levels and values are read from the inflated buffer; otherwise they are
 * read directly from `cursor`.
 *
 * @returns `{ dlevels, rlevels, values, count }` for this page
 */
function decodeDataPage(cursor, header, column, compression) {
    const cursorEnd = cursor.offset + header.compressed_page_size;
    const valueCount = header.data_page_header.num_values;

    /* uncompress page */
    let dataCursor = cursor;
    if (compression !== 'UNCOMPRESSED') {
        const valuesBuf = Compression.inflate(compression, cursor.buffer.slice(cursor.offset, cursorEnd), header.uncompressed_page_size);
        dataCursor = {
            buffer: valuesBuf,
            offset: 0,
            size: valuesBuf.length
        };
        // The outer cursor skips the compressed bytes in one step.
        cursor.offset = cursorEnd;
    }

    /* read repetition levels */
    const rLevelEncoding = Util.getThriftEnum(thrift_1.Encoding, header.data_page_header.repetition_level_encoding);
    let rLevels = new Array(valueCount);
    if (column.rLevelMax > 0) {
        rLevels = decodeValues(PARQUET_RDLVL_TYPE, rLevelEncoding, dataCursor, valueCount, {
            bitWidth: Util.getBitWidth(column.rLevelMax),
            disableEnvelope: false
        });
    }
    else {
        rLevels.fill(0);
    }

    /* read definition levels */
    const dLevelEncoding = Util.getThriftEnum(thrift_1.Encoding, header.data_page_header.definition_level_encoding);
    let dLevels = new Array(valueCount);
    if (column.dLevelMax > 0) {
        dLevels = decodeValues(PARQUET_RDLVL_TYPE, dLevelEncoding, dataCursor, valueCount, {
            bitWidth: Util.getBitWidth(column.dLevelMax),
            disableEnvelope: false
        });
    }
    else {
        dLevels.fill(0);
    }

    // Only entries whose definition level equals the column maximum have a
    // value stored in the page.
    let valueCountNonNull = 0;
    for (const dlvl of dLevels) {
        if (dlvl === column.dLevelMax) {
            valueCountNonNull++;
        }
    }

    /* read values */
    const valueEncoding = Util.getThriftEnum(thrift_1.Encoding, header.data_page_header.encoding);
    const values = decodeValues(column.primitiveType, valueEncoding, dataCursor, valueCountNonNull, {
        typeLength: column.typeLength,
        bitWidth: column.typeLength
    });

    return {
        dlevels: dLevels,
        rlevels: rLevels,
        values,
        count: valueCount
    };
}

/**
 * Decode a v2 data page starting at `cursor.offset`.
 *
 * Levels are read directly from `cursor`; only the values section is
 * inflated, and only when the page header marks it as compressed.
 *
 * @returns `{ dlevels, rlevels, values, count }` for this page
 */
function decodeDataPageV2(cursor, header, column, compression) {
    const cursorEnd = cursor.offset + header.compressed_page_size;
    const valueCount = header.data_page_header_v2.num_values;
    const valueCountNonNull = valueCount - header.data_page_header_v2.num_nulls;
    const valueEncoding = Util.getThriftEnum(thrift_1.Encoding, header.data_page_header_v2.encoding);

    /* read repetition levels */
    let rLevels = new Array(valueCount);
    if (column.rLevelMax > 0) {
        rLevels = decodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, cursor, valueCount, {
            bitWidth: Util.getBitWidth(column.rLevelMax),
            disableEnvelope: true
        });
    }
    else {
        rLevels.fill(0);
    }

    /* read definition levels */
    let dLevels = new Array(valueCount);
    if (column.dLevelMax > 0) {
        dLevels = decodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, cursor, valueCount, {
            bitWidth: Util.getBitWidth(column.dLevelMax),
            disableEnvelope: true
        });
    }
    else {
        dLevels.fill(0);
    }

    /* read values */
    let valuesBufCursor = cursor;
    if (header.data_page_header_v2.is_compressed) {
        const valuesBuf = Compression.inflate(compression, cursor.buffer.slice(cursor.offset, cursorEnd), header.uncompressed_page_size);
        valuesBufCursor = {
            buffer: valuesBuf,
            offset: 0,
            size: valuesBuf.length
        };
        cursor.offset = cursorEnd;
    }
    const values = decodeValues(column.primitiveType, valueEncoding, valuesBufCursor, valueCountNonNull, {
        typeLength: column.typeLength,
        bitWidth: column.typeLength
    });

    return {
        dlevels: dLevels,
        rlevels: rLevels,
        values,
        count: valueCount
    };
}

/**
 * Rebuild a nested schema definition from the flat list of schema elements.
 *
 * @param schemaElements flat list of schema elements
 * @param offset index of the first element to decode
 * @param len number of sibling elements to decode at this level
 * @returns `{ schema, offset, next }` where `next` is the index of the first
 *   element after the decoded subtree
 */
function decodeSchema(schemaElements, offset, len) {
    const schema = {};
    let next = offset;
    for (let i = 0; i < len; i++) {
        const schemaElement = schemaElements[next];
        const repetitionType = next > 0
            ? Util.getThriftEnum(thrift_1.FieldRepetitionType, schemaElement.repetition_type)
            : 'ROOT';
        let optional = false;
        let repeated = false;
        switch (repetitionType) {
            case 'REQUIRED':
                break;
            case 'OPTIONAL':
                optional = true;
                break;
            case 'REPEATED':
                repeated = true;
                break;
        }
        if (schemaElement.num_children > 0) {
            // Group node: its children follow immediately in the flat list.
            const res = decodeSchema(schemaElements, next + 1, schemaElement.num_children);
            next = res.next;
            schema[schemaElement.name] = {
                optional,
                repeated,
                fields: res.schema
            };
        }
        else {
            // Leaf node: the converted type, when present, takes precedence
            // over the physical type.
            let logicalType = Util.getThriftEnum(thrift_1.Type, schemaElement.type);
            if (schemaElement.converted_type != null) {
                logicalType = Util.getThriftEnum(thrift_1.ConvertedType, schemaElement.converted_type);
            }
            schema[schemaElement.name] = {
                type: logicalType,
                typeLength: schemaElement.type_length,
                optional,
                repeated
            };
            next++;
        }
    }
    return { schema, offset, next };
}
//# sourceMappingURL=reader.js.map