"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const stream_1 = require("stream");
const codec_1 = require("./codec");
const Compression = require("./compression");
const Shred = require("./shred");
// tslint:disable-next-line:max-line-length
const thrift_1 = require("./thrift");
const Util = require("./util");
const Int64 = require("node-int64");
/**
 * Parquet File Magic String
 */
const PARQUET_MAGIC = 'PAR1';
/**
 * Parquet File Format Version
 */
const PARQUET_VERSION = 1;
/**
 * Default Page and Row Group sizes
 */
const PARQUET_DEFAULT_PAGE_SIZE = 8192;
const PARQUET_DEFAULT_ROW_GROUP_SIZE = 4096;
/**
 * Repetition and Definition Level Encoding
 */
const PARQUET_RDLVL_TYPE = 'INT32';
const PARQUET_RDLVL_ENCODING = 'RLE';
/**
 * Write a parquet file to an output stream. The ParquetWriter will perform
 * buffering/batching for performance, so close() must be called after all rows
 * are written.
 */
class ParquetWriter {
    /**
     * Create a new buffered parquet writer for a given envelope writer
     */
    constructor(schema, envelopeWriter, opts) {
        this.schema = schema;
        this.envelopeWriter = envelopeWriter;
        this.rowBuffer = {};
        this.rowGroupSize = opts.rowGroupSize || PARQUET_DEFAULT_ROW_GROUP_SIZE;
        this.closed = false;
        this.userMetadata = {};
        try {
            envelopeWriter.writeHeader();
        }
        catch (err) {
            envelopeWriter.close();
            throw err;
        }
    }
    /**
     * Convenience method to create a new buffered parquet writer that writes to
     * the specified file
     */
    static async openFile(schema, path, opts) {
        const outputStream = await Util.osopen(path, opts);
        return ParquetWriter.openStream(schema, outputStream, opts);
    }
    /**
     * Convenience method to create a new buffered parquet writer that writes to
     * the specified stream
     */
    static async openStream(schema, outputStream, opts) {
        if (!opts) {
            // tslint:disable-next-line:no-parameter-reassignment
            opts = {};
        }
        const envelopeWriter = await ParquetEnvelopeWriter.openStream(schema, outputStream, opts);
        return new ParquetWriter(schema, envelopeWriter, opts);
    }
    /**
     * Append a single row to the parquet file. Rows are buffered in memory until
     * rowGroupSize rows are in the buffer or close() is called
     */
    async appendRow(row) {
        if (this.closed) {
            throw new Error('writer was closed');
        }
        Shred.shredRecord(this.schema, row, this.rowBuffer);
        if (this.rowBuffer.rowCount >= this.rowGroupSize) {
            await this.envelopeWriter.writeRowGroup(this.rowBuffer);
            this.rowBuffer = {};
        }
    }
    /**
     * Finish writing the parquet file and commit the footer to disk. This method
     * MUST be called after you are finished adding rows. You must not call this
     * method twice on the same object or add any rows after the close() method has
     * been called
     */
    async close(callback) {
        if (this.closed) {
            throw new Error('writer was closed');
        }
        this.closed = true;
        if (this.rowBuffer.rowCount > 0) {
            await this.envelopeWriter.writeRowGroup(this.rowBuffer);
            this.rowBuffer = {};
        }
        await this.envelopeWriter.writeFooter(this.userMetadata);
        await this.envelopeWriter.close();
        this.envelopeWriter = null;
        if (callback) {
            callback();
        }
    }
    /**
     * Add key<>value metadata to the file
     */
    setMetadata(key, value) {
        // TODO: value to be any, obj -> JSON
        this.userMetadata[String(key)] = String(value);
    }
    /**
     * Set the parquet row group size. This value controls the maximum number
     * of rows that are buffered in memory at any given time as well as the number
     * of rows that are co-located on disk. A higher value is generally better for
     * read-time I/O performance at the cost of write-time memory usage.
     */
    setRowGroupSize(cnt) {
        this.rowGroupSize = cnt;
    }
    /**
     * Set the parquet data page size. The data page size controls the maximum
     * number of column values that are written to disk as a consecutive array
     */
    setPageSize(cnt) {
        this.envelopeWriter.setPageSize(cnt);
    }
}
exports.ParquetWriter = ParquetWriter;
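/*
 * Usage sketch (kept as a comment so it never runs on require): buffered writing
 * with ParquetWriter. The ParquetSchema import, field names and output path are
 * assumptions made for the example, not part of this module.
 *
 *   const { ParquetSchema } = require('./schema');
 *   const schema = new ParquetSchema({
 *       name: { type: 'UTF8' },
 *       quantity: { type: 'INT64', optional: true }
 *   });
 *   // inside an async function:
 *   const writer = await ParquetWriter.openFile(schema, '/tmp/fruits.parquet');
 *   await writer.appendRow({ name: 'apples', quantity: 10 });
 *   await writer.appendRow({ name: 'oranges' });
 *   await writer.close(); // flushes the last row group and writes the footer
 */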
/**
 * Create a parquet file from a schema and a number of row groups. This class
 * performs direct, unbuffered writes to the underlying output stream and is
 * intended for advanced and internal users; the writeXXX methods must be
 * called in the correct order to produce a valid file.
 */
class ParquetEnvelopeWriter {
    /**
     * Create a new parquet envelope writer that writes to the specified stream
     */
    static async openStream(schema, outputStream, opts) {
        const writeFn = Util.oswrite.bind(undefined, outputStream);
        const closeFn = Util.osclose.bind(undefined, outputStream);
        return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts);
    }
    constructor(schema, writeFn, closeFn, fileOffset, opts) {
        this.schema = schema;
        this.write = writeFn;
        this.close = closeFn;
        this.offset = fileOffset;
        this.rowCount = 0;
        this.rowGroups = [];
        this.pageSize = opts.pageSize || PARQUET_DEFAULT_PAGE_SIZE;
        this.useDataPageV2 = ('useDataPageV2' in opts) ? opts.useDataPageV2 : false;
    }
    writeSection(buf) {
        this.offset += buf.length;
        return this.write(buf);
    }
    /**
     * Encode the parquet file header
     */
    writeHeader() {
        return this.writeSection(Buffer.from(PARQUET_MAGIC));
    }
    /**
     * Encode a parquet row group. The records object should be created using the
     * shredRecord method
     */
    writeRowGroup(records) {
        const rgroup = encodeRowGroup(this.schema, records, {
            baseOffset: this.offset,
            pageSize: this.pageSize,
            useDataPageV2: this.useDataPageV2
        });
        this.rowCount += records.rowCount;
        this.rowGroups.push(rgroup.metadata);
        return this.writeSection(rgroup.body);
    }
    /**
     * Write the parquet file footer
     */
    writeFooter(userMetadata) {
        if (!userMetadata) {
            // tslint:disable-next-line:no-parameter-reassignment
            userMetadata = {};
        }
        return this.writeSection(encodeFooter(this.schema, this.rowCount, this.rowGroups, userMetadata));
    }
    /**
     * Set the parquet data page size. The data page size controls the maximum
     * number of column values that are written to disk as a consecutive array
     */
    setPageSize(cnt) {
        this.pageSize = cnt;
    }
}
exports.ParquetEnvelopeWriter = ParquetEnvelopeWriter;
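/*
 * Call-order sketch for the low-level API (kept as a comment so it never runs on
 * require). The envelope writer does no buffering, so the caller shreds rows and
 * invokes the write methods in file order; `schema`, `outputStream` and `row` are
 * assumed to exist for the example.
 *
 *   // inside an async function:
 *   const envelope = await ParquetEnvelopeWriter.openStream(schema, outputStream, {});
 *   await envelope.writeHeader();               // 'PAR1' magic
 *   const rowBuffer = {};
 *   Shred.shredRecord(schema, row, rowBuffer);  // repeat per row
 *   await envelope.writeRowGroup(rowBuffer);    // repeat per row group
 *   await envelope.writeFooter({});             // thrift metadata + length + 'PAR1'
 *   await envelope.close();
 */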
/**
 * Create a parquet transform stream
 */
class ParquetTransformer extends stream_1.Transform {
    constructor(schema, opts = {}) {
        super({ objectMode: true });
        const writeProxy = (function (t) {
            return function (b) {
                t.push(b);
            };
        })(this);
        this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, () => ({}), 0, opts), opts);
    }
    // tslint:disable-next-line:function-name
    _transform(row, encoding, callback) {
        if (row) {
            // forward write errors to the stream instead of leaving the promise rejection unhandled
            this.writer.appendRow(row).then(() => callback(), err => callback(err));
        }
        else {
            callback();
        }
    }
    // tslint:disable-next-line:function-name
    _flush(callback) {
        this.writer.close(callback);
    }
}
exports.ParquetTransformer = ParquetTransformer;
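/*
 * Usage sketch (kept as a comment so it never runs on require): piping an
 * object-mode stream of row objects through the transform into a file. The
 * source stream, schema and output path are assumptions made for the example.
 *
 *   const fs = require('fs');
 *   const transformer = new ParquetTransformer(schema, { useDataPageV2: false });
 *   rowObjectStream                  // any Readable in object mode
 *       .pipe(transformer)           // shreds and encodes rows
 *       .pipe(fs.createWriteStream('/tmp/rows.parquet'));
 */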
/**
 * Encode a consecutive array of data using one of the parquet encodings
 */
function encodeValues(type, encoding, values, opts) {
    if (!(encoding in codec_1.PARQUET_CODEC)) {
        throw new Error(`invalid encoding: ${encoding}`);
    }
    return codec_1.PARQUET_CODEC[encoding].encodeValues(type, values, opts);
}
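/*
 * Example (kept as a comment so it never runs on require): this is how the page
 * encoders below call encodeValues. Assumes the codec module exposes a 'PLAIN'
 * codec alongside 'RLE'; the literal values are made up.
 *
 *   // three INT32 values, 4 bytes each, little-endian => a 12-byte Buffer
 *   const valuesBuf = encodeValues('INT32', 'PLAIN', [1, 2, 3], {});
 *   // repetition/definition levels use RLE with an explicit bit width
 *   const levelsBuf = encodeValues('INT32', 'RLE', [0, 1, 1], { bitWidth: 1 });
 */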
/**
 * Encode a parquet data page
 */
function encodeDataPage(column, data) {
    /* encode repetition and definition levels */
    let rLevelsBuf = Buffer.alloc(0);
    if (column.rLevelMax > 0) {
        rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.rlevels, {
            bitWidth: Util.getBitWidth(column.rLevelMax)
            // disableEnvelope: false
        });
    }
    let dLevelsBuf = Buffer.alloc(0);
    if (column.dLevelMax > 0) {
        dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.dlevels, {
            bitWidth: Util.getBitWidth(column.dLevelMax)
            // disableEnvelope: false
        });
    }
    /* encode values */
    const valuesBuf = encodeValues(column.primitiveType, column.encoding, data.values, { typeLength: column.typeLength, bitWidth: column.typeLength });
    const dataBuf = Buffer.concat([
        rLevelsBuf,
        dLevelsBuf,
        valuesBuf
    ]);
    // compression = column.compression === 'UNCOMPRESSED' ? (compression || 'UNCOMPRESSED') : column.compression;
    const compressedBuf = Compression.deflate(column.compression, dataBuf);
    /* build page header */
    const header = new thrift_1.PageHeader({
        type: thrift_1.PageType.DATA_PAGE,
        data_page_header: new thrift_1.DataPageHeader({
            num_values: data.count,
            encoding: thrift_1.Encoding[column.encoding],
            definition_level_encoding: thrift_1.Encoding[PARQUET_RDLVL_ENCODING],
            repetition_level_encoding: thrift_1.Encoding[PARQUET_RDLVL_ENCODING]
        }),
        uncompressed_page_size: dataBuf.length,
        compressed_page_size: compressedBuf.length
    });
    /* concat page header, repetition and definition levels and values */
    const headerBuf = Util.serializeThrift(header);
    const page = Buffer.concat([
        headerBuf,
        compressedBuf
    ]);
    return { header, headerSize: headerBuf.length, page };
}
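// Resulting DATA_PAGE (v1) layout: [ PageHeader (thrift) | compressed( rlevels + dlevels + values ) ]
// The levels are compressed together with the values, unlike DATA_PAGE_V2 below.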
/**
 * Encode a parquet data page (v2)
 */
function encodeDataPageV2(column, data, rowCount) {
    /* encode values */
    const valuesBuf = encodeValues(column.primitiveType, column.encoding, data.values, {
        typeLength: column.typeLength,
        bitWidth: column.typeLength
    });
    // compression = column.compression === 'UNCOMPRESSED' ? (compression || 'UNCOMPRESSED') : column.compression;
    const compressedBuf = Compression.deflate(column.compression, valuesBuf);
    /* encode repetition and definition levels */
    let rLevelsBuf = Buffer.alloc(0);
    if (column.rLevelMax > 0) {
        rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.rlevels, {
            bitWidth: Util.getBitWidth(column.rLevelMax),
            disableEnvelope: true
        });
    }
    let dLevelsBuf = Buffer.alloc(0);
    if (column.dLevelMax > 0) {
        dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, data.dlevels, {
            bitWidth: Util.getBitWidth(column.dLevelMax),
            disableEnvelope: true
        });
    }
    /* build page header */
    const header = new thrift_1.PageHeader({
        type: thrift_1.PageType.DATA_PAGE_V2,
        data_page_header_v2: new thrift_1.DataPageHeaderV2({
            num_values: data.count,
            num_nulls: data.count - data.values.length,
            num_rows: rowCount,
            encoding: thrift_1.Encoding[column.encoding],
            definition_levels_byte_length: dLevelsBuf.length,
            repetition_levels_byte_length: rLevelsBuf.length,
            is_compressed: column.compression !== 'UNCOMPRESSED'
        }),
        uncompressed_page_size: rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length,
        compressed_page_size: rLevelsBuf.length + dLevelsBuf.length + compressedBuf.length
    });
    /* concat page header, repetition and definition levels and values */
    const headerBuf = Util.serializeThrift(header);
    const page = Buffer.concat([
        headerBuf,
        rLevelsBuf,
        dLevelsBuf,
        compressedBuf
    ]);
    return { header, headerSize: headerBuf.length, page };
}
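// Resulting DATA_PAGE_V2 layout: [ PageHeader (thrift) | rlevels | dlevels | compressed( values ) ]
// The levels stay uncompressed and unenveloped; readers locate them via the
// repetition/definition byte lengths recorded in the page header.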
/**
 * Encode an array of values into a parquet column chunk
 */
function encodeColumnChunk(column, buffer, offset, opts) {
    const data = buffer.columnData[column.path.join()];
    const baseOffset = (opts.baseOffset || 0) + offset;
    /* encode data page(s) */
    // const pages: Buffer[] = [];
    let pageBuf;
    // tslint:disable-next-line:variable-name
    let total_uncompressed_size = 0;
    // tslint:disable-next-line:variable-name
    let total_compressed_size = 0;
    {
        let result;
        if (opts.useDataPageV2) {
            result = encodeDataPageV2(column, data, buffer.rowCount);
        }
        else {
            result = encodeDataPage(column, data);
        }
        // pages.push(result.page);
        pageBuf = result.page;
        total_uncompressed_size += result.header.uncompressed_page_size + result.headerSize;
        total_compressed_size += result.header.compressed_page_size + result.headerSize;
    }
    // const pagesBuf = Buffer.concat(pages);
    // const compression = column.compression === 'UNCOMPRESSED' ? (opts.compression || 'UNCOMPRESSED') : column.compression;
    /* prepare metadata header */
    const metadata = new thrift_1.ColumnMetaData({
        path_in_schema: column.path,
        num_values: data.count,
        data_page_offset: baseOffset,
        encodings: [],
        total_uncompressed_size,
        total_compressed_size,
        type: thrift_1.Type[column.primitiveType],
        codec: thrift_1.CompressionCodec[column.compression]
    });
    /* list encodings */
    metadata.encodings.push(thrift_1.Encoding[PARQUET_RDLVL_ENCODING]);
    metadata.encodings.push(thrift_1.Encoding[column.encoding]);
    /* concat metadata header and data pages */
    const metadataOffset = baseOffset + pageBuf.length;
    const body = Buffer.concat([pageBuf, Util.serializeThrift(metadata)]);
    return { body, metadata, metadataOffset };
}
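// Column chunk layout produced above: [ data page | ColumnMetaData (thrift) ], with
// metadataOffset pointing at the serialized metadata that follows the page. Only a
// single data page per chunk is written at the moment (see the commented-out `pages` array).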
/**
 * Encode a list of column values into a parquet row group
 */
function encodeRowGroup(schema, data, opts) {
    const metadata = new thrift_1.RowGroup({
        num_rows: data.rowCount,
        columns: [],
        total_byte_size: 0
    });
    let body = Buffer.alloc(0);
    for (const field of schema.fieldList) {
        if (field.isNested) {
            continue;
        }
        const cchunkData = encodeColumnChunk(field, data, body.length, opts);
        const cchunk = new thrift_1.ColumnChunk({
            file_offset: cchunkData.metadataOffset,
            meta_data: cchunkData.metadata
        });
        metadata.columns.push(cchunk);
        metadata.total_byte_size = new Int64(+metadata.total_byte_size + cchunkData.body.length);
        body = Buffer.concat([body, cchunkData.body]);
    }
    return { body, metadata };
}
/**
 * Encode a parquet file metadata footer
 */
function encodeFooter(schema, rowCount, rowGroups, userMetadata) {
    const metadata = new thrift_1.FileMetaData({
        version: PARQUET_VERSION,
        created_by: 'parquets',
        num_rows: rowCount,
        row_groups: rowGroups,
        schema: [],
        key_value_metadata: []
    });
    for (const key in userMetadata) {
        const kv = new thrift_1.KeyValue({
            key,
            value: userMetadata[key]
        });
        metadata.key_value_metadata.push(kv);
    }
    {
        const schemaRoot = new thrift_1.SchemaElement({
            name: 'root',
            num_children: Object.keys(schema.fields).length
        });
        metadata.schema.push(schemaRoot);
    }
    for (const field of schema.fieldList) {
        const relt = thrift_1.FieldRepetitionType[field.repetitionType];
        const schemaElem = new thrift_1.SchemaElement({
            name: field.name,
            repetition_type: relt
        });
        if (field.isNested) {
            schemaElem.num_children = field.fieldCount;
        }
        else {
            schemaElem.type = thrift_1.Type[field.primitiveType];
        }
        if (field.originalType) {
            schemaElem.converted_type = thrift_1.ConvertedType[field.originalType];
        }
        schemaElem.type_length = field.typeLength;
        metadata.schema.push(schemaElem);
    }
    const metadataEncoded = Util.serializeThrift(metadata);
    const footerEncoded = Buffer.alloc(metadataEncoded.length + 8);
    metadataEncoded.copy(footerEncoded);
    footerEncoded.writeUInt32LE(metadataEncoded.length, metadataEncoded.length);
    footerEncoded.write(PARQUET_MAGIC, metadataEncoded.length + 4);
    return footerEncoded;
}
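// Footer layout produced above: [ FileMetaData (thrift) | uint32 LE metadata length | 'PAR1' ].
// A reader finds the metadata by reading the last 8 bytes of the file: the trailing
// magic plus the 4-byte length that points back to the start of the thrift structure.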
//# sourceMappingURL=writer.js.map