cm-tarnation
An alternative parser for CodeMirror 6
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
import { LanguageDescription, ParseContext } from "@codemirror/language"
import {
Input,
NestedParse,
parseMixed,
Parser as CodeMirrorParser,
ParseWrapper,
PartialParse,
SyntaxNodeRef,
Tree,
TreeFragment
} from "@lezer/common"
import { ChunkBuffer } from "./compiler/buffer"
import { Compiler } from "./compiler/compiler"
import {
DISABLED_NESTED,
LIMIT_TO_VIEWPORT,
MARGIN_AFTER,
MARGIN_BEFORE,
NodeID,
REUSE_LEFT,
REUSE_RIGHT
} from "./constants"
import type { GrammarState } from "./grammar/state"
import type { TarnationLanguage } from "./language"
import { ParseRegion } from "./region"
import type { GrammarToken } from "./types"
import { EmbeddedParserProp, perfy } from "./util"
/**
* Factory for correctly instantiating {@link Parser} instances. To
* CodeMirror, this class is the `parser`, and a {@link Parser} is the
* running process of said parser.
*/
export class ParserFactory extends CodeMirrorParser {
/** A wrapper function that enables mixed parsing. */
private declare wrapper: ParseWrapper
/**
* @param language - The {@link TarnationLanguage} that this factory
* passes to the {@link Parser} instances it constructs.
*/
constructor(private language: TarnationLanguage) {
super()
this.wrapper = parseMixed(this.nest.bind(this))
}
createParse(
input: Input,
fragments: TreeFragment[],
ranges: { from: number; to: number }[]
) {
const parser = new Parser(this.language, input, fragments, ranges)
return DISABLED_NESTED ? parser : this.wrapper(parser, input, fragments, ranges)
}
/**
* Special "nest" function provided to the `parseMixed` function.
* Determines whether a node indicates a nested parsing region and, if so,
* returns a `NestedParse` for that region.
*/
private nest(cursor: SyntaxNodeRef, input: Input): NestedParse | null {
// don't bother with empty nodes
if (cursor.from === cursor.to) return null
let name: string | undefined
let overlay: { from: number; to: number }[] | undefined
// let's try the configured function first
if (!name && this.language.configure.nest) {
const result = this.language.configure.nest(cursor, input)
if (result) ({ name, overlay } = result)
}
// didn't work (or didn't exist), try the default
// get name from the per-node property, use entire node as range
if (!name) name = cursor.type.prop(EmbeddedParserProp)
// nothing found
if (!name) return null
let langs: readonly LanguageDescription[]
if (!(this.language.nestLanguages instanceof Array)) {
const context = ParseContext.get()
langs = context ? context.state.facet(this.language.nestLanguages) : []
} else {
langs = this.language.nestLanguages
}
const lang = LanguageDescription.matchLanguageName(langs, name)
// language doesn't exist
if (!lang) return null
// language already loaded
if (lang.support) return { parser: lang.support.language.parser, overlay }
// language not loaded yet
return { parser: ParseContext.getSkippingParser(lang.load()), overlay }
}
}
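/*
 * A minimal sketch of a "nest" callback as consumed above (the callback
 * signature comes from this file; everything else, including the node name
 * and how the language annotation is read, is a hypothetical example):
 *
 *   import type { Input, SyntaxNodeRef } from "@lezer/common"
 *
 *   function nest(cursor: SyntaxNodeRef, input: Input) {
 *     // hypothetical node type emitted by the grammar for fenced code blocks
 *     if (cursor.name === "CodeBlock") {
 *       // read a short slice of the node and treat its first word as the language name
 *       const slice = input.read(cursor.from, Math.min(cursor.to, cursor.from + 32))
 *       const name = slice.trim().split(/\s+/)[0]
 *       return { name, overlay: [{ from: cursor.from, to: cursor.to }] }
 *     }
 *     return null
 *   }
 */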
/**
* The `Parser` is the main interface for tokenizing and parsing, and what
* CodeMirror directly interacts with.
*
* Additionally, the `Parser` handles the recovery of grammar state from
* the stale trees provided by CodeMirror, and then uses this data to
* restart tokenization with reused tokens.
*
* Note that the `Parser` is not a persistent object: it is discarded as
* soon as the parse is done. That means its startup time is very significant.
*/
export class Parser implements PartialParse {
/** The host language. */
private declare language: TarnationLanguage
/**
* An object storing details about the region of the document to be
* parsed, where it was edited, the length, etc.
*/
private declare region: ParseRegion
/** The current state of the grammar, such as the stack. */
private declare state: GrammarState
/** {@link Chunk} buffer, where matched tokens are cached. */
private declare buffer: ChunkBuffer
/** The compiler, which assembles the chunk buffer into the final `Tree`. */
private declare compiler: Compiler
/**
* A buffer containing the stale _ahead_ state of the tokenized output.
* That is, when a user makes a change, this holds all of the tokenization
* data for the previous document after the location of that change.
*/
private declare previousRight?: ChunkBuffer
/** A function used to measure how long the parse is taking. */
private declare measurePerformance: (msg?: string) => number
/** The current position of the parser. */
declare parsedPos: number
/**
* The position the parser will stop at early, if it was given a position
* to stop at.
*/
declare stoppedAt: number | null
/** The current performance value, in milliseconds. */
declare performance?: number
/**
* @param language - The language containing the grammar to use.
* @param input - The input document to parse.
* @param fragments - The fragments to be used for determining reuse of
* previous parses.
* @param ranges - The ranges of the document to parse.
*/
constructor(
language: TarnationLanguage,
input: Input,
fragments: TreeFragment[],
ranges: { from: number; to: number }[]
) {
this.measurePerformance = perfy()
this.language = language
this.stoppedAt = null
const context = ParseContext.get()
const viewport = context?.viewport ? context.viewport : undefined
this.region = new ParseRegion(input, ranges, fragments, viewport)
// find cached data, if possible
if (REUSE_LEFT && fragments?.length) {
for (let idx = 0; idx < fragments.length; idx++) {
const f = fragments[idx]
// make sure fragment is within the region of the document we care about
if (f.from > this.region.from || f.to < this.region.from) continue
// try to find the buffer for this fragment's tree in the cache
const buffer = this.find(f.tree, this.region.from, f.to)
if (buffer) {
// try to find a suitable chunk from the buffer to restart tokenization from
const { chunk, index } = buffer.search(this.region.edit!.from, -1)
if (chunk && index !== null) {
// split the buffer, reuse the left side,
// but keep the right side around for reuse as well
const { left, right } = buffer.split(index)
this.previousRight = right
this.region.from = chunk.from
this.buffer = left
this.state = chunk.state.clone()
}
}
}
}
this.parsedPos = this.region.from
// if we couldn't reuse state, we'll need to start things up with a default state
if (!this.buffer || !this.state) {
this.buffer = new ChunkBuffer()
this.state = this.language.grammar!.startState()
}
this.compiler = new Compiler(this.language, this.buffer)
// if we reused left, we'll catch the compiler up to the current position
if (this.buffer.chunks.length) this.compiler.advanceFully()
}
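/*
 * A worked example of the reuse above (positions are hypothetical): suppose
 * the previous parse produced chunks covering [0, 500) and an edit lands at
 * position 200. The buffer recovered from the stale tree is split at the
 * chunk containing 200: the left half becomes `this.buffer` and is reused
 * as-is, tokenization restarts from that chunk's `from` with its saved
 * grammar state, and the right half is parked in `previousRight` so that
 * `tryToReuse` can splice it back in once the parse moves past the edit.
 */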
/** True if the parser is done. */
get done() {
return this.parsedPos >= this.region.to
}
/**
* Notifies the parser to not progress past the given position.
*
* @param pos - The position to stop at.
*/
stopAt(pos: number) {
this.region.to = pos
this.stoppedAt = pos
}
/** Advances tokenization one step. */
advance(): Tree | null {
// if we're told to stop, we need to BAIL
if (this.stoppedAt && this.parsedPos >= this.stoppedAt) {
return this.finish()
}
this.nextChunk()
this.compiler.step()
if (this.done) return this.finish()
return null
}
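/*
 * A rough sketch of how this PartialParse is driven. Normally ParseContext
 * and the editor do this internally; the standalone loop below is purely
 * illustrative (assuming `factory` is a ParserFactory instance):
 *
 *   const parse = factory.createParse(input, fragments, ranges)
 *   let tree: Tree | null = null
 *   while (!(tree = parse.advance())) {
 *     // each call tokenizes one chunk and advances the compiler one step
 *   }
 */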
private finish(): Tree {
const start = this.region.original.from
const length = this.region.original.length
const tree = this.compiler.compile(start, length)
if (LIMIT_TO_VIEWPORT) {
const context = ParseContext.get()
// inform editor that we skipped everything past the viewport
if (
context &&
!this.stoppedAt &&
this.parsedPos > context.viewport.to &&
this.parsedPos < this.region.original.to
) {
context.skipUntilInView(this.parsedPos, this.region.original.to)
}
}
this.performance = this.measurePerformance()
this.language.performance = this.performance
return tree
}
/** Advances the parser to the next chunk. Returns true if a chunk was emitted. */
private nextChunk() {
// this condition is a little misleading,
// as we're actually going to break out when any chunk is emitted.
// however, if we're at the "last chunk", this condition catches that
while (this.parsedPos < this.region.to) {
if (REUSE_RIGHT) {
// try to reuse ahead state
const reused = this.previousRight && this.tryToReuse(this.previousRight)
// can't reuse the buffer more than once (pointless)
if (reused) this.previousRight = undefined
}
const pos = this.parsedPos
const startState = this.state.clone()
let matchTokens: GrammarToken[] | null = null
let length = 0
const start = Math.max(pos - MARGIN_BEFORE, this.region.from)
const startCompensated = this.region.compensate(pos, start - pos)
const str = this.region.read(startCompensated, MARGIN_AFTER, this.region.to)
const match = this.language.grammar!.match(this.state, str, pos - start, pos)
if (match) {
this.state = match.state
matchTokens = match.compile()
length = match.length
}
// if we didn't match, we'll advance with an error token to prevent getting stuck
else {
matchTokens = [[NodeID.ERROR_ADVANCE, pos, pos + 1]]
length = 1
}
this.parsedPos = this.region.compensate(pos, length)
let addedChunk = false
for (let idx = 0; idx < matchTokens.length; idx++) {
const t = matchTokens[idx]
if (!this.region.contiguous) {
const from = this.region.compensate(pos, t[1] - pos)
const end = this.region.compensate(pos, t[2] - pos)
t[1] = from
t[2] = end
}
if (this.buffer.add(startState, t)) addedChunk = true
}
if (addedChunk) return true
}
return false
}
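/*
 * A worked example of the matching window above (constant values are
 * hypothetical): with MARGIN_BEFORE = 50 and MARGIN_AFTER = 500, a match
 * attempt at pos = 1000 reads the document from max(1000 - 50, region.from)
 * up to roughly 500 characters ahead, and the grammar matches at offset
 * pos - start within that string. Because a failed match still emits a
 * one-character ERROR_ADVANCE token, the loop always makes progress.
 */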
/**
* Tries to reuse a buffer _ahead_ of the current position. Returns true
* if this was successful, otherwise false.
*
* @param right - The buffer to try and reuse.
*/
private tryToReuse(right: ChunkBuffer) {
// can't reuse if we don't know the safe regions
if (!this.region.edit) return false
// can only safely reuse if we're ahead of the edited region
if (this.parsedPos <= this.region.edit.to) return false
// check every chunk and see if we can reuse it
for (let idx = 0; idx < right.chunks.length; idx++) {
const chunk = right.chunks[idx]
if (chunk.isReusable(this.state, this.parsedPos, this.region.edit.offset)) {
right.slide(idx, this.region.edit.offset, true)
this.buffer.link(right, this.region.original.length)
this.buffer.ensureLast(this.parsedPos, this.state)
this.state = this.buffer.last!.state.clone()
this.parsedPos = this.buffer.last!.from
return true
}
}
return false
}
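/*
 * A worked example of the reuse check above (numbers are hypothetical): if an
 * edit replaced a 10-character word with a 15-character one, edit.offset is
 * +5. Once the parse has moved past edit.to, a chunk in the old right-hand
 * buffer whose shifted position and saved grammar state line up with the
 * current parse is considered reusable: slide() applies the +5 offset from
 * that chunk onward, link() appends the chunks to the live buffer, and the
 * parse position jumps to the start of the buffer's final chunk.
 */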
/**
* Returns the first chunk buffer found within a tree, if any.
*
* @param tree - The tree to search through, recursively.
* @param from - The start of the search area.
* @param to - The end of the search area.
* @param offset - An offset added to the tree's positions, so that they
* may match some other source's positions.
*/
private find(tree: Tree, from: number, to: number, offset = 0): ChunkBuffer | null {
const bundle =
offset >= from && offset + tree.length >= to
? tree.prop(this.language.stateProp!)
: undefined
if (bundle) return bundle
// recursively check children
for (let i = tree.children.length - 1; i >= 0; i--) {
const child = tree.children[i]
const pos = offset + tree.positions[i]
if (!(child instanceof Tree && pos < to)) continue
const found = this.find(child, from, to, pos)
if (found) return found
}
return null
}
}
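/*
 * A small sketch of the round trip that `find` relies on (illustrative only;
 * `language.stateProp` is internal to TarnationLanguage): the compiler
 * attaches the ChunkBuffer to the emitted Tree through a NodeProp, so a later
 * parse can walk the stale tree CodeMirror hands back and recover the buffer:
 *
 *   const buffer = tree.prop(language.stateProp)  // ChunkBuffer | undefined
 *   if (buffer) {
 *     // restart tokenization from a chunk near the edit instead of from scratch
 *   }
 */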