UNPKG

micromark

Version:

Small CommonMark-compliant markdown parser with positional info and concrete tokens.

399 lines (326 loc) 10.6 kB
export default createTokenizer

import assert from 'assert'
import createDebug from 'debug'
import assign from '../constant/assign.mjs'
import codes from '../character/codes.mjs'
import markdownLineEnding from '../character/markdown-line-ending.mjs'
import chunkedPush from './chunked-push.mjs'
import chunkedSplice from './chunked-splice.mjs'
import miniflat from './miniflat.mjs'
import resolveAll from './resolve-all.mjs'
import serializeChunks from './serialize-chunks.mjs'
import shallow from './shallow.mjs'
import sliceChunks from './slice-chunks.mjs'

var debug = createDebug('micromark')

// Create a tokenizer.
// Tokenizers deal with one type of data (e.g., containers, flow, text).
// The parser is the object dealing with it all.
// `initialize` works like other constructs, except that only its `tokenize`
// function is used, in which case it doesn’t receive an `ok` or `nok`.
// `from` can be given to set the point before the first character, although
// when further lines are indented, they must be set with `defineSkip`.
//
// Returns the tokenizer `context` object, whose `write` method feeds chunks
// in and (once the EOF code is seen) yields the fully resolved event list.
function createTokenizer(parser, initialize, from) {
  // Current place in the input: `line`/`column`/`offset` are the public
  // position; `_index`/`_bufferIndex` (added below) track where we are in
  // `chunks`. `shallow` copies `from` so the caller’s point isn’t mutated.
  var point = from ? shallow(from) : {line: 1, column: 1, offset: 0}
  // Map of `line` -> `column` registered through `defineSkip`, applied by
  // `accountForPotentialSkip` when a new line starts.
  var columnStart = {}
  // Constructs whose `resolveAll` must run once at the very end.
  var resolveAllConstructs = []
  // Input seen so far: strings and/or numeric character codes.
  var chunks = []
  // Stack of tokens currently open (entered but not yet exited).
  var stack = []
  // Handshake flag between `go` (hands a code to the state machine) and
  // `consume`/`ok`/`nok` (must mark it consumed exactly once).
  var consumed = true

  // Tools used for tokenizing.
  var effects = {
    consume: consume,
    enter: enter,
    exit: exit,
    // `attempt` keeps the construct’s results; the others discard them
    // (restore) after probing.
    attempt: constructFactory(onsuccessfulconstruct),
    check: constructFactory(onsuccessfulcheck),
    interrupt: constructFactory(onsuccessfulcheck, {interrupt: true}),
    lazy: constructFactory(onsuccessfulcheck, {lazy: true})
  }

  // State and tools for resolving and serializing.
  var context = {
    previous: codes.eof,
    events: [],
    parser: parser,
    sliceStream: sliceStream,
    sliceSerialize: sliceSerialize,
    now: now,
    defineSkip: skip,
    write: write
  }

  // The state function.
  var state = initialize.tokenize.call(context, effects)

  // Track which character we expect to be consumed, to catch bugs.
  var expectedCode

  if (initialize.resolveAll) {
    resolveAllConstructs.push(initialize)
  }

  // Store where we are in the input stream.
  point._index = 0
  point._bufferIndex = -1

  return context

  // Feed a slice of chunks into the state machine.
  // Returns `[]` while more input is expected; returns the resolved events
  // once the EOF code has been written.
  function write(slice) {
    chunks = chunkedPush(chunks, slice)

    main()

    // Exit if we’re not done, resolve might change stuff.
    if (chunks[chunks.length - 1] !== codes.eof) {
      return []
    }

    addResult(initialize, 0)

    // Otherwise, resolve, and exit.
    context.events = resolveAll(resolveAllConstructs, context.events, context)

    return context.events
  }

  //
  // Tools.
  //

  // Get the string value of a token’s chunks.
  function sliceSerialize(token) {
    return serializeChunks(sliceStream(token))
  }

  // Get the chunks spanned by a token.
  function sliceStream(token) {
    return sliceChunks(chunks, token)
  }

  // Snapshot of the current point (copied, so safe to keep).
  function now() {
    return shallow(point)
  }

  // Exposed as `defineSkip`: remember that `value.line` really starts at
  // `value.column` (used for indented container content).
  function skip(value) {
    columnStart[value.line] = value.column
    accountForPotentialSkip()
    debug('position: define skip: `%j`', point)
  }

  //
  // State management.
  //

  // Main loop (note that `_index` and `_bufferIndex` in `point` are modified by
  // `consume`).
  // Here is where we walk through the chunks, which either include strings of
  // several characters, or numerical character codes.
  // The reason to do this in a loop instead of a call is so the stack can
  // drain.
  function main() {
    var chunkIndex
    var chunk

    while (point._index < chunks.length) {
      chunk = chunks[point._index]

      // If we’re in a buffer chunk, loop through it.
      if (typeof chunk === 'string') {
        chunkIndex = point._index

        if (point._bufferIndex < 0) {
          point._bufferIndex = 0
        }

        // `consume` advances `_bufferIndex` (and may bump `_index` at the
        // end of the string), so this inner loop makes progress.
        while (
          point._index === chunkIndex &&
          point._bufferIndex < chunk.length
        ) {
          go(chunk.charCodeAt(point._bufferIndex))
        }
      } else {
        go(chunk)
      }
    }
  }

  // Deal with one code.
  // Calls the current state function with `code` and stores the state it
  // returns; `consumed`/`expectedCode` guard against constructs that forget
  // to consume or consume the wrong code.
  function go(code) {
    assert.equal(consumed, true, 'expected character to be consumed')
    consumed = undefined
    debug('main: passing `%s` to %s', code, state.name)
    expectedCode = code
    state = state(code)
  }

  // Move a character forward.
  function consume(code) {
    assert.equal(
      code,
      expectedCode,
      'expected given code to equal expected code'
    )

    debug('consume: `%s`', code)

    assert.equal(consumed, undefined, 'expected code to not have been consumed')
    assert(
      code === null
        ? !context.events.length ||
            context.events[context.events.length - 1][0] === 'exit'
        : context.events[context.events.length - 1][0] === 'enter',
      'expected last token to be open'
    )

    if (markdownLineEnding(code)) {
      point.line++
      point.column = 1
      // A CRLF chunk counts as two offsets, any other line ending as one.
      point.offset += code === codes.carriageReturnLineFeed ? 2 : 1
      accountForPotentialSkip()
      debug('position: after eol: `%j`', point)
    } else if (code !== codes.virtualSpace) {
      // Virtual spaces (tab expansion) don’t advance the position.
      point.column++
      point.offset++
    }

    // Not in a string chunk.
    if (point._bufferIndex < 0) {
      point._index++
    } else {
      point._bufferIndex++

      // At end of string chunk.
      if (point._bufferIndex === chunks[point._index].length) {
        point._bufferIndex = -1
        point._index++
      }
    }

    // Expose the previous character.
    context.previous = code

    // Mark as consumed.
    consumed = true
  }

  // Start a token.
  // `fields` (optional) seeds extra token fields; `type` and `start` are set
  // here. Pushes an `enter` event and the token onto the open stack.
  function enter(type, fields) {
    var token = fields || {}
    token.type = type
    token.start = now()

    assert.equal(typeof type, 'string', 'expected string type')
    assert.notEqual(type.length, 0, 'expected non-empty string')
    debug('enter: `%s`', type)

    context.events.push(['enter', token, context])

    stack.push(token)

    return token
  }

  // Stop a token.
  // Pops the matching open token, stamps its `end`, and pushes an `exit`
  // event. Asserts the token is non-empty and matches `type`.
  function exit(type) {
    assert.equal(typeof type, 'string', 'expected string type')
    assert.notEqual(type.length, 0, 'expected non-empty string')
    assert.notEqual(stack.length, 0, 'cannot close w/o open tokens')

    var token = stack.pop()
    token.end = now()

    assert.equal(type, token.type, 'expected exit token to match current token')

    assert(
      !(
        token.start._index === token.end._index &&
        token.start._bufferIndex === token.end._bufferIndex
      ),
      'expected non-empty token (`' + type + '`)'
    )

    debug('exit: `%s`', token.type)

    context.events.push(['exit', token, context])

    return token
  }

  // Use results.
  function onsuccessfulconstruct(construct, info) {
    addResult(construct, info.from)
  }

  // Discard results.
  function onsuccessfulcheck(construct, info) {
    info.restore()
  }

  // Factory to attempt/check/interrupt.
  // `onreturn` decides what happens when a construct succeeds (keep or
  // discard); `fields` (optional) is merged into the context the construct’s
  // tokenizer runs with.
  function constructFactory(onreturn, fields) {
    return hook

    // Handle either an object mapping codes to constructs, a list of
    // constructs, or a single construct.
    function hook(constructs, returnState, bogusState) {
      var listOfConstructs
      var constructIndex
      var currentConstruct
      var info

      return constructs.tokenize || 'length' in constructs
        ? handleListOfConstructs(miniflat(constructs))
        : handleMapOfConstructs

      // Map form: look up the construct(s) registered for this code (plus
      // any registered under `null`, which apply to every code).
      function handleMapOfConstructs(code) {
        if (code in constructs || codes.eof in constructs) {
          return handleListOfConstructs(
            constructs.null
              ? /* c8 ignore next */
                miniflat(constructs[code]).concat(miniflat(constructs.null))
              : constructs[code]
          )(code)
        }

        return bogusState(code)
      }

      // Try each construct in `list`, in order, starting with the first;
      // `nok` advances to the next one.
      function handleListOfConstructs(list) {
        listOfConstructs = list
        constructIndex = 0
        return handleConstruct(list[constructIndex])
      }

      function handleConstruct(construct) {
        return start

        function start(code) {
          // To do: not need to store if there is no bogus state, probably?
          // Currently doesn’t work because `inspect` in document does a check
          // w/o a bogus, which doesn’t make sense. But it does seem to help perf
          // by not storing.
          info = store()
          currentConstruct = construct

          if (!construct.partial) {
            context.currentConstruct = construct
          }

          // Named constructs can be disabled through parser options.
          if (
            construct.name &&
            context.parser.constructs.disable.null.indexOf(construct.name) > -1
          ) {
            return nok(code)
          }

          return construct.tokenize.call(
            fields ? assign({}, context, fields) : context,
            effects,
            ok,
            nok
          )(code)
        }
      }

      // The construct matched: hand results to `onreturn` (which keeps or
      // restores them) and continue in `returnState`.
      function ok(code) {
        assert.equal(code, expectedCode, 'expected code')
        consumed = true
        onreturn(currentConstruct, info)
        return returnState
      }

      // The construct failed: roll back, then try the next construct in the
      // list, or fall through to `bogusState` when none are left.
      function nok(code) {
        assert.equal(code, expectedCode, 'expected code')
        consumed = true
        info.restore()

        if (++constructIndex < listOfConstructs.length) {
          return handleConstruct(listOfConstructs[constructIndex])
        }

        return bogusState
      }
    }
  }

  // Apply a successful construct’s resolvers to the events it produced
  // (from index `from` onward), and register its `resolveAll` for the end.
  function addResult(construct, from) {
    if (construct.resolveAll && resolveAllConstructs.indexOf(construct) < 0) {
      resolveAllConstructs.push(construct)
    }

    if (construct.resolve) {
      chunkedSplice(
        context.events,
        from,
        context.events.length - from,
        construct.resolve(context.events.slice(from), context)
      )
    }

    if (construct.resolveTo) {
      context.events = construct.resolveTo(context.events, context)
    }

    assert(
      !context.events.length ||
        context.events[context.events.length - 1][0] === 'exit',
      'expected last token to end'
    )
  }

  // Snapshot the tokenizer state so an attempt/check can be undone.
  // Returns `{restore, from}`: `from` is the event index the attempt starts
  // at; `restore` rewinds point, previous code, current construct, events,
  // and the open-token stack to this snapshot.
  function store() {
    var startPoint = now()
    var startPrevious = context.previous
    var startCurrentConstruct = context.currentConstruct
    var startEventsIndex = context.events.length
    var startStack = Array.from(stack)

    return {restore: restore, from: startEventsIndex}

    function restore() {
      point = startPoint
      context.previous = startPrevious
      context.currentConstruct = startCurrentConstruct
      context.events.length = startEventsIndex
      stack = startStack
      accountForPotentialSkip()
      debug('position: restore: `%j`', point)
    }
  }

  // If a skip was defined for the current line (see `skip`) and we’re at the
  // start of it, jump the column/offset forward to the registered column.
  function accountForPotentialSkip() {
    if (point.line in columnStart && point.column < 2) {
      point.column = columnStart[point.line]
      point.offset += columnStart[point.line] - 1
    }
  }
}