/* jshint node: true */

// TODO: Add minimal templating.
// TODO: Add option to prefix nested type declarations with the outer types'
// names.

'use strict';

/** IDL to protocol (services) and schema (types) parsing logic. */

var files = require('./files'),
    utils = require('./utils'),
    path = require('path'),
    util = require('util');


var f = util.format;

// Default type references defined by Avro.
var TYPE_REFS = {
  date: {type: 'int', logicalType: 'date'},
  decimal: {type: 'bytes', logicalType: 'decimal'},
  time_ms: {type: 'long', logicalType: 'time-millis'},
  timestamp_ms: {type: 'long', logicalType: 'timestamp-millis'}
};
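// For example, with the default references above, an IDL field declared as
// `timestamp_ms created;` should come out with the type
// `{"type": "long", "logicalType": "timestamp-millis"}`. Passing a custom
// `typeRefs` map via the reader options overrides this table.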
/** Assemble an IDL file into a decoded protocol. */
function assembleProtocol(fpath, opts, cb) {
  if (!cb && typeof opts == 'function') {
    cb = opts;
    opts = undefined;
  }
  opts = opts || {};
  if (!opts.importHook) {
    opts.importHook = files.createImportHook();
  }

  importFile(fpath, function (err, protocol) {
    if (err) {
      cb(err);
      return;
    }
    if (!protocol) {
      cb(new Error('empty root import'));
      return;
    }
    var schemas = protocol.types;
    if (schemas) {
      // Strip redundant namespaces from types before returning the protocol.
      // Note that we keep empty (`''`) nested namespaces when the outer one is
      // non-empty. This allows figuring out whether unqualified imported names
      // should be qualified by the protocol's namespace: they should if their
      // namespace is `undefined` and should not if it is empty.
      var namespace = protocolNamespace(protocol) || '';
      schemas.forEach(function (schema) {
        if (schema.namespace === namespace) {
          delete schema.namespace;
        }
      });
    }
    cb(null, protocol);
  });

  function importFile(fpath, cb) {
    opts.importHook(fpath, 'idl', function (err, str) {
      if (err) {
        cb(err);
        return;
      }
      if (str === undefined) {
        // This signals a file that was already imported by the default import
        // hooks. Implementors who wish to disallow duplicate imports should
        // provide a custom hook which throws an error when a duplicate is
        // detected.
        cb();
        return;
      }
      try {
        var reader = new Reader(str, opts);
        var obj = reader._readProtocol(str, opts);
      } catch (err) {
        err.path = fpath; // To help debug which file caused the error.
        cb(err);
        return;
      }
      fetchImports(obj.protocol, obj.imports, cb);
    });
  }

  function fetchImports(protocol, imports, cb) {
    var importedProtocols = [];
    next();

    function next() {
      var info = imports.shift();
      if (!info) {
        // We are done with this file. We prepend all imported types to this
        // file's own and can return the final result.
        importedProtocols.reverse();
        try {
          importedProtocols.forEach(function (imported) {
            mergeImport(protocol, imported);
          });
        } catch (err) {
          cb(err);
          return;
        }
        cb(null, protocol);
        return;
      }
      var importPath = path.join(path.dirname(fpath), info.name);
      if (info.kind === 'idl') {
        importFile(importPath, function (err, imported) {
          if (err) {
            cb(err);
            return;
          }
          if (imported) {
            importedProtocols.push(imported);
          }
          next();
        });
      } else {
        // We are importing a protocol or schema file.
        opts.importHook(importPath, info.kind, function (err, str) {
          if (err) {
            cb(err);
            return;
          }
          switch (info.kind) {
            case 'protocol':
            case 'schema':
              if (str === undefined) {
                // Skip duplicate import (see related comment above).
                next();
                return;
              }
              try {
                var obj = JSON.parse(str);
              } catch (err) {
                err.path = importPath;
                cb(err);
                return;
              }
              var imported = info.kind === 'schema' ? {types: [obj]} : obj;
              importedProtocols.push(imported);
              next();
              return;
            default:
              cb(new Error(f('invalid import kind: %s', info.kind)));
          }
        });
      }
    }
  }

  function mergeImport(protocol, imported) {
    // Merge the types first (where we don't need to check for duplicates,
    // since instantiating the service will take care of it), then the messages
    // (where we need to, as duplicates would overwrite each other).
    var schemas = imported.types || [];
    schemas.reverse();
    schemas.forEach(function (schema) {
      if (!protocol.types) {
        protocol.types = [];
      }
      // Ensure the imported protocol's namespace is inherited correctly (it
      // might be different from the current one).
      if (schema.namespace === undefined) {
        schema.namespace = protocolNamespace(imported) || '';
      }
      protocol.types.unshift(schema);
    });
    Object.keys(imported.messages || {}).forEach(function (name) {
      if (!protocol.messages) {
        protocol.messages = {};
      }
      if (protocol.messages[name]) {
        throw new Error(f('duplicate message: %s', name));
      }
      protocol.messages[name] = imported.messages[name];
    });
  }
}
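// Example usage of `assembleProtocol` (a minimal sketch; `./Ping.avdl` is a
// hypothetical path):
//
//   assembleProtocol('./Ping.avdl', function (err, protocol) {
//     if (err) { throw err; }
//     // `protocol` is the decoded protocol object, with any imports inlined.
//   });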
// Parsing functions.

/**
 * Convenience function to parse multiple inputs into protocols and schemas.
 *
 * It should cover most basic use-cases but has a few limitations:
 *
 * + It doesn't allow passing options to the parsing step.
 * + The protocol/type inference logic can be deceived.
 *
 * The parsing logic is as follows:
 *
 * + If `str` contains `path.sep` (on Windows `\`, otherwise `/`) and is a path
 *   to an existing file, it will first be read as JSON, then as an IDL
 *   specification if JSON parsing failed. If either succeeds, the result is
 *   returned, otherwise the next steps are run using the file's content
 *   instead of the input path.
 * + If `str` is a valid JSON string, it is parsed then returned.
 * + If `str` is a valid IDL protocol specification, it is parsed and returned
 *   if no imports are present (and an error is thrown if there are any
 *   imports).
 * + If `str` is a valid IDL type specification, it is parsed and returned.
 * + If none of the above cases apply, `str` is returned as-is.
 */
function read(str) {
  var schema;
  if (
    typeof str == 'string' &&
    ~str.indexOf(path.sep) &&
    files.existsSync(str)
  ) {
    // Try interpreting `str` as a path to a file containing a JSON schema or
    // an IDL protocol. Note that we add the second check to skip primitive
    // references (e.g. `"int"`, the most common use-case for `avro.parse`).
    var contents = files.readFileSync(str, {encoding: 'utf8'});
    try {
      return JSON.parse(contents);
    } catch (err) {
      var opts = {importHook: files.createSyncImportHook()};
      assembleProtocol(str, opts, function (err, protocolSchema) {
        schema = err ? contents : protocolSchema;
      });
    }
  } else {
    schema = str;
  }
  if (typeof schema != 'string' || schema === 'null') {
    // This last predicate is to allow `read('null')` to work similarly to
    // `read('int')` and other primitives (null needs to be handled separately
    // since it is also a valid JSON identifier).
    return schema;
  }
  try {
    return JSON.parse(schema);
  } catch (err) {
    try {
      return Reader.readProtocol(schema);
    } catch (err) {
      try {
        return Reader.readSchema(schema);
      } catch (err) {
        return schema;
      }
    }
  }
}
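// A few illustrative calls to `read` (sketch only, outputs abbreviated):
//
//   read('int'); // == 'int' (primitive references pass through unchanged).
//   read('{"type": "map", "values": "long"}'); // Parsed as JSON.
//   read('record Pair { int x; int y; }');
//   // == {type: 'record', name: 'Pair', fields: [{type: 'int', name: 'x'}, ...]}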
function Reader(str, opts) {
  opts = opts || {};

  this._tk = new Tokenizer(str);
  this._ackVoidMessages = !!opts.ackVoidMessages;
  this._implicitTags = !opts.delimitedCollections;
  this._typeRefs = opts.typeRefs || TYPE_REFS;
}

Reader.readProtocol = function (str, opts) {
  var reader = new Reader(str, opts);
  var protocol = reader._readProtocol();
  if (protocol.imports.length) {
    // Imports can only be resolved when the IDL file is provided via its
    // path, so we fail rather than silently ignore them.
    throw new Error('unresolvable import');
  }
  return protocol.protocol;
};

Reader.readSchema = function (str, opts) {
  var reader = new Reader(str, opts);
  var doc = reader._readJavadoc();
  var schema = reader._readType(doc === undefined ? {} : {doc: doc}, true);
  reader._tk.next({id: '(eof)'}); // Check that we have read everything.
  return schema;
};
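// Illustrative sketches of the two entry points above:
//
//   Reader.readSchema('array<int>');
//   // == {type: 'array', items: 'int'}
//   Reader.readProtocol('protocol Math { int add(int a, int b); }');
//   // == {protocol: 'Math', messages: {add: {request: [...], response: 'int'}}}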
Reader.prototype._readProtocol = function () {
  var tk = this._tk;
  var imports = [];
  var types = [];
  var messages = {};
  var pos;

  // Outer declarations (outside of the protocol block).
  this._readImports(imports);
  var protocolSchema = {};
  var protocolJavadoc = this._readJavadoc();
  if (protocolJavadoc !== undefined) {
    protocolSchema.doc = protocolJavadoc;
  }
  this._readAnnotations(protocolSchema);
  tk.next({val: 'protocol'});
  if (!tk.next({val: '{', silent: true})) {
    // Named protocol.
    protocolSchema.protocol = tk.next({id: 'name'}).val;
    tk.next({val: '{'});
  }

  // Inner declarations.
  while (!tk.next({val: '}', silent: true})) {
    if (!this._readImports(imports)) {
      var javadoc = this._readJavadoc();
      var typeSchema = this._readType({}, true);
      var numImports = this._readImports(imports, true);
      var message = undefined;
      // We mark our position and try to parse a message from here.
      pos = tk.pos;
      if (!numImports && (message = this._readMessage(typeSchema))) {
        // Note that if any imports were found, we cannot be parsing a message.
        if (javadoc !== undefined && message.schema.doc === undefined) {
          message.schema.doc = javadoc;
        }
        var oneWay = false;
        if (
          message.schema.response === 'void' ||
          message.schema.response.type === 'void'
        ) {
          oneWay = !this._ackVoidMessages && !message.schema.errors;
          if (message.schema.response === 'void') {
            message.schema.response = 'null';
          } else {
            message.schema.response.type = 'null';
          }
        }
        if (oneWay) {
          message.schema['one-way'] = true;
        }
        if (messages[message.name]) {
          // We have to do this check here otherwise the duplicate will be
          // overwritten (and service instantiation won't be able to catch it).
          throw new Error(f('duplicate message: %s', message.name));
        }
        messages[message.name] = message.schema;
      } else {
        // This was a standalone type definition.
        if (javadoc) {
          if (typeof typeSchema == 'string') {
            typeSchema = {doc: javadoc, type: typeSchema};
          } else if (typeSchema.doc === undefined) {
            typeSchema.doc = javadoc;
          }
        }
        types.push(typeSchema);
        // We backtrack to just after the type declaration and swallow any
        // trailing semi-colon (to make type declarations more consistent).
        tk.pos = pos;
        tk.next({val: ';', silent: true});
      }
      javadoc = undefined;
    }
  }
  tk.next({id: '(eof)'});

  if (types.length) {
    protocolSchema.types = types;
  }
  if (Object.keys(messages).length) {
    protocolSchema.messages = messages;
  }
  return {protocol: protocolSchema, imports: imports};
};

Reader.prototype._readAnnotations = function (schema) {
  var tk = this._tk;
  while (tk.next({val: '@', silent: true})) {
    // Annotations are allowed to have names which aren't valid Avro names, so
    // we must advance until we hit the first left parenthesis.
    var parts = [];
    while (!tk.next({val: '(', silent: true})) {
      parts.push(tk.next().val);
    }
    schema[parts.join('')] = tk.next({id: 'json'}).val;
    tk.next({val: ')'});
  }
};

Reader.prototype._readMessage = function (responseSchema) {
  var tk = this._tk;
  var schema = {request: [], response: responseSchema};
  this._readAnnotations(schema);
  var name = tk.next().val;
  if (tk.next().val !== '(') {
    // This isn't a message.
    return;
  }
  if (!tk.next({val: ')', silent: true})) {
    do {
      schema.request.push(this._readField());
    } while (!tk.next({val: ')', silent: true}) && tk.next({val: ','}));
  }

  var token = tk.next();
  switch (token.val) {
    case 'throws':
      // It doesn't seem like the IDL spec is explicit about which syntax to
      // use for multiple errors. We will assume a comma-separated list.
      schema.errors = [];
      do {
        schema.errors.push(this._readType());
      } while (!tk.next({val: ';', silent: true}) && tk.next({val: ','}));
      break;
    case 'oneway':
      schema['one-way'] = true;
      tk.next({val: ';'});
      break;
    case ';':
      break;
    default:
      throw tk.error('invalid message suffix', token);
  }
  return {name: name, schema: schema};
};

Reader.prototype._readJavadoc = function () {
  var token = this._tk.next({id: 'javadoc', emitJavadoc: true, silent: true});
  if (token) {
    return token.val;
  }
};

Reader.prototype._readField = function () {
  var tk = this._tk;
  var javadoc = this._readJavadoc();
  var schema = {type: this._readType()};
  if (javadoc !== undefined && schema.doc === undefined) {
    schema.doc = javadoc;
  }
  this._readAnnotations(schema);
  schema.name = tk.next({id: 'name'}).val;
  if (tk.next({val: '=', silent: true})) {
    schema['default'] = tk.next({id: 'json'}).val;
  }
  return schema;
};

Reader.prototype._readType = function (schema, top) {
  schema = schema || {};
  this._readAnnotations(schema);
  schema.type = this._tk.next({id: 'name'}).val;
  switch (schema.type) {
    case 'record':
    case 'error':
      return this._readRecord(schema);
    case 'fixed':
      return this._readFixed(schema);
    case 'enum':
      return this._readEnum(schema, top);
    case 'map':
      return this._readMap(schema);
    case 'array':
      return this._readArray(schema);
    case 'union':
      if (Object.keys(schema).length > 1) {
        throw new Error('union annotations are not supported');
      }
      return this._readUnion();
    default:
      // Reference.
      var ref = this._typeRefs[schema.type];
      if (ref) {
        delete schema.type; // Always overwrite the type.
        utils.copyOwnProperties(ref, schema);
      }
      return Object.keys(schema).length > 1 ? schema : schema.type;
  }
};

Reader.prototype._readFixed = function (schema) {
  var tk = this._tk;
  if (!tk.next({val: '(', silent: true})) {
    schema.name = tk.next({id: 'name'}).val;
    tk.next({val: '('});
  }
  schema.size = parseInt(tk.next({id: 'number'}).val);
  tk.next({val: ')'});
  return schema;
};

Reader.prototype._readMap = function (schema) {
  var tk = this._tk;
  // Brackets are unwieldy when declaring inline types. We allow them to be
  // omitted (but we keep the consistency that if the entry bracket is present,
  // the exit one must be as well). Note that this is non-standard.
  var silent = this._implicitTags;
  var implicitTags = tk.next({val: '<', silent: silent}) === undefined;
  schema.values = this._readType();
  tk.next({val: '>', silent: implicitTags});
  return schema;
};

Reader.prototype._readArray = function (schema) {
  var tk = this._tk;
  var silent = this._implicitTags;
  var implicitTags = tk.next({val: '<', silent: silent}) === undefined;
  schema.items = this._readType();
  tk.next({val: '>', silent: implicitTags});
  return schema;
};
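// For instance, with the two collection readers above (sketch; the
// bracket-less form is rejected when `delimitedCollections` is set):
//
//   Reader.readSchema('map<int>'); // == {type: 'map', values: 'int'}
//   Reader.readSchema('map int');  // Same result, with the brackets omitted.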
Reader.prototype._readEnum = function (schema, top) {
  var tk = this._tk;
  if (!tk.next({val: '{', silent: true})) {
    schema.name = tk.next({id: 'name'}).val;
    tk.next({val: '{'});
  }
  schema.symbols = [];
  do {
    schema.symbols.push(tk.next().val);
  } while (!tk.next({val: '}', silent: true}) && tk.next({val: ','}));
  // To avoid confusing syntax, reader enums (i.e. enums with a default value)
  // can only be defined at the top level.
  if (top && tk.next({val: '=', silent: true})) {
    schema.default = tk.next().val;
    tk.next({val: ';'});
  }
  return schema;
};

Reader.prototype._readUnion = function () {
  var tk = this._tk;
  var arr = [];
  tk.next({val: '{'});
  do {
    arr.push(this._readType());
  } while (!tk.next({val: '}', silent: true}) && tk.next({val: ','}));
  return arr;
};

Reader.prototype._readRecord = function (schema) {
  var tk = this._tk;
  if (!tk.next({val: '{', silent: true})) {
    schema.name = tk.next({id: 'name'}).val;
    tk.next({val: '{'});
  }
  schema.fields = [];
  while (!tk.next({val: '}', silent: true})) {
    schema.fields.push(this._readField());
    tk.next({val: ';'});
  }
  return schema;
};

Reader.prototype._readImports = function (imports, maybeMessage) {
  var tk = this._tk;
  var numImports = 0;
  var pos = tk.pos;
  while (tk.next({val: 'import', silent: true})) {
    if (!numImports && maybeMessage && tk.next({val: '(', silent: true})) {
      // This will happen if a message is named `import`.
      tk.pos = pos;
      return;
    }
    var kind = tk.next({id: 'name'}).val;
    var fname = JSON.parse(tk.next({id: 'string'}).val);
    tk.next({val: ';'});
    imports.push({kind: kind, name: fname});
    numImports++;
  }
  return numImports;
};

// Helpers.

/**
 * Simple class to split an input string into tokens.
 *
 * There are different types of tokens, characterized by their `id`:
 *
 * + `number`, numbers.
 * + `name`, references.
 * + `string`, double-quoted.
 * + `operator`, anything else, always a single character.
 * + `javadoc`, only emitted when `next` is called with `emitJavadoc` set.
 * + `json`, only emitted when `next` is called with `'json'` as `id` (the
 *   tokenizer doesn't have enough context to predict these).
 */
function Tokenizer(str) {
  this._str = str;
  this.pos = 0;
}

Tokenizer.prototype.next = function (opts) {
  var token = {pos: this.pos, id: undefined, val: undefined};
  var javadoc = this._skip(opts && opts.emitJavadoc);
  if (javadoc) {
    token.id = 'javadoc';
    token.val = javadoc;
  } else {
    var pos = this.pos;
    var str = this._str;
    var c = str.charAt(pos);
    if (!c) {
      token.id = '(eof)';
    } else {
      if (opts && opts.id === 'json') {
        token.id = 'json';
        this.pos = this._endOfJson();
      } else if (c === '"') {
        token.id = 'string';
        this.pos = this._endOfString();
      } else if (/[0-9]/.test(c)) {
        token.id = 'number';
        this.pos = this._endOf(/[0-9]/);
      } else if (/[`A-Za-z_.]/.test(c)) {
        token.id = 'name';
        this.pos = this._endOf(/[`A-Za-z0-9_.]/);
      } else {
        token.id = 'operator';
        this.pos = pos + 1;
      }
      token.val = str.slice(pos, this.pos);
      if (token.id === 'json') {
        // Let's be nice and give a more helpful error message when this occurs
        // (JSON parsing errors wouldn't let us find the location otherwise).
        try {
          token.val = JSON.parse(token.val);
        } catch (err) {
          throw this.error('invalid JSON', token);
        }
      } else if (token.id === 'name') {
        // Unescape names (our parser doesn't need them).
        token.val = token.val.replace(/`/g, '');
      }
    }
  }

  var err;
  if (opts && opts.id && opts.id !== token.id) {
    err = this.error(f('expected ID %s', opts.id), token);
  } else if (opts && opts.val && opts.val !== token.val) {
    err = this.error(f('expected value %s', opts.val), token);
  }
  if (!err) {
    return token;
  } else if (opts && opts.silent) {
    this.pos = token.pos; // Backtrack to start of token.
    return undefined;
  } else {
    throw err;
  }
};

Tokenizer.prototype.error = function (reason, context) {
  // Context must be either a token or a position.
  var isToken = typeof context != 'number';
  var pos = isToken ? context.pos : context;
  var str = this._str;
  var lineNum = 1;
  var lineStart = 0;
  var i;
  for (i = 0; i < pos; i++) {
    if (str.charAt(i) === '\n') {
      lineNum++;
      lineStart = i;
    }
  }
  var msg = isToken ? f('invalid token %j: %s', context, reason) : reason;
  var err = new Error(msg);
  err.token = isToken ? context : undefined;
  err.lineNum = lineNum;
  err.colNum = pos - lineStart;
  return err;
};

/** Skip whitespace and comments. */
Tokenizer.prototype._skip = function (emitJavadoc) {
  var str = this._str;
  var isJavadoc = false;
  var pos, c;

  while ((c = str.charAt(this.pos)) && /\s/.test(c)) {
    this.pos++;
  }
  pos = this.pos;
  if (c === '/') {
    switch (str.charAt(this.pos + 1)) {
      case '/':
        this.pos += 2;
        while ((c = str.charAt(this.pos)) && c !== '\n') {
          this.pos++;
        }
        return this._skip(emitJavadoc);
      case '*':
        this.pos += 2;
        if (str.charAt(this.pos) === '*') {
          isJavadoc = true;
        }
        while ((c = str.charAt(this.pos++))) {
          if (c === '*' && str.charAt(this.pos) === '/') {
            this.pos++;
            if (isJavadoc && emitJavadoc) {
              return extractJavadoc(str.slice(pos + 3, this.pos - 2));
            }
            return this._skip(emitJavadoc);
          }
        }
        throw this.error('unterminated comment', pos);
    }
  }
};

/** Generic "end of" helper, used for name and number tokens. */
Tokenizer.prototype._endOf = function (pat) {
  var pos = this.pos;
  var str = this._str;
  while (pat.test(str.charAt(pos))) {
    pos++;
  }
  return pos;
};

/** Find the end of a string. */
Tokenizer.prototype._endOfString = function () {
  var pos = this.pos + 1; // Skip first double quote.
  var str = this._str;
  var c;
  while ((c = str.charAt(pos))) {
    if (c === '"') {
      // The spec doesn't explicitly say so, but IDLs likely only
      // allow double quotes for strings (C- and Java-style).
      return pos + 1;
    }
    if (c === '\\') {
      pos += 2;
    } else {
      pos++;
    }
  }
  throw this.error('unterminated string', pos - 1);
};

/** Find the end of a JSON object, throwing an error if the input ends first. */
Tokenizer.prototype._endOfJson = function () {
  var pos = utils.jsonEnd(this._str, this.pos);
  if (pos < 0) {
    throw this.error('invalid JSON', pos);
  }
  return pos;
};
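// A quick sketch of how the tokenizer above is driven:
//
//   var tk = new Tokenizer('fixed MD5(16);');
//   tk.next(); // == {pos: 0, id: 'name', val: 'fixed'}
//   tk.next(); // == {pos: 6, id: 'name', val: 'MD5'}
//   tk.next(); // == {pos: 9, id: 'operator', val: '('}
//   tk.next({id: 'number'}); // == {pos: 10, id: 'number', val: '16'}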
/**
 * Extract Javadoc contents from the comment.
 *
 * The parsing done here is very simple: it only removes the line prefixes and
 * leading / trailing empty lines. It's better to be conservative with
 * formatting rather than risk losing information.
 */
function extractJavadoc(str) {
  var lines = str
    .replace(/^[ \t]+|[ \t]+$/g, '') // Trim whitespace.
    .split('\n').map(function (line, i) {
      return i ? line.replace(/^\s*\*\s?/, '') : line;
    });
  while (!lines[0]) {
    lines.shift();
  }
  while (!lines[lines.length - 1]) {
    lines.pop();
  }
  return lines.join('\n');
}

/** Returns the namespace generated by a protocol. */
function protocolNamespace(protocol) {
  if (protocol.namespace) {
    return protocol.namespace;
  }
  var match = /^(.*)\.[^.]+$/.exec(protocol.protocol);
  return match ? match[1] : undefined;
}


module.exports = {
  Tokenizer: Tokenizer,
  assembleProtocol: assembleProtocol,
  read: read,
  readProtocol: Reader.readProtocol,
  readSchema: Reader.readSchema
};
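// Typical consumption (a rough sketch; the exact require path depends on how
// the package exposes this module):
//
//   var specs = require('./idl');
//   var schema = specs.readSchema('enum Suit { SPADES, HEARTS }');
//   specs.assembleProtocol('./Ping.avdl', function (err, protocol) {
//     // ...
//   });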