UNPKG

langxlang

Version:

LLM wrapper for OpenAI GPT and Google Gemini and PaLM 2 models

754 lines (727 loc) 27.3 kB
// Stripping here refers to removing unnecessary tokens, comments and white-space in a program to minimize the amount of tokens
// that are needed to represent the program. In languages like Java, there's lots of syntax tokens that are needed for the program
// to run, but not needed for the purpose of abstractly understanding program logic. Think things like public/private, final, etc.

/** Collapse every run of three or more newlines into exactly two (one blank line). */
function removeExtraLines (str) {
  return str.replace(/\n{3,}/g, '\n\n')
}

/** Convert Windows (CRLF) line endings to LF. */
function normalizeLineEndings (str) {
  return str.replace(/\r\n/g, '\n')
}

/** Count how many times `char` occurs in `str`. */
function count (str, char) {
  let c = 0
  for (const s of str) {
    if (s === char) c++
  }
  return c
}

/** Count how many consecutive `char`s `str` starts with. */
function countStart (str, char) {
  let c = 0
  for (const s of str) {
    if (s === char) c++
    else break
  }
  return c
}

/** Remove `<!-- ... -->` comments (non-greedy, may span lines). */
function stripXmlComments (text) {
  return text.replace(/<!--[\s\S]*?-->/g, '')
}

/** Remove `<!--- ... -->` comments (three dashes on the opening side). */
function stripMdpComments (text) {
  return text.replace(/<!---[\s\S]*?-->/g, '')
}

// ---------------------------------------------------------------------------
// Shared machinery for the C-like language strippers (Java, PHP, Go)
// ---------------------------------------------------------------------------

const JAVA_QUOTES = new Map([['"', 'string']])
const C_STYLE_QUOTES = new Map([['"', 'double-quote-string'], ["'", 'single-quote-string']])
const GO_RAW_QUOTES = new Map([['`', 'raw-string']])
const NO_QUOTES = new Map()
const IDENTIFIER_CHAR = /[A-Za-z0-9_]/

/**
 * True when the character at `index` is escaped, i.e. preceded by an odd number of backslashes.
 * Looking only at the previous character is not enough: in `"C:\\"` the closing quote follows a
 * backslash that is itself escaped.
 */
function isEscaped (code, index) {
  let backslashes = 0
  for (let j = index - 1; j >= 0 && code[j] === '\\'; j--) backslashes++
  return backslashes % 2 === 1
}

/**
 * True when `label` at `index` closes a PHP heredoc/nowdoc: it must be the first non-blank text on
 * its line and must not be followed by another identifier character.
 */
function isHeredocTerminator (code, index, label) {
  if (label === '' || !code.startsWith(label, index)) return false
  const after = code[index + label.length]
  if (after !== undefined && IDENTIFIER_CHAR.test(after)) return false
  const lineStart = code.lastIndexOf('\n', index - 1) + 1
  return code.slice(lineStart, index).trim() === ''
}

/**
 * Split source text into `[text, type]` tokens. `type` is 'code', 'multi-line-comment',
 * 'single-line-comment', one of the string types named by `quotes` / `rawQuotes`, or
 * 'heredoc-string' / 'nowdoc-string' when `heredoc` is enabled.
 *
 * A single-line comment token does NOT include its terminating newline; the newline starts the
 * following code token, so dropping the comment can never glue two source lines together.
 *
 * @param {string} code
 * @param {object} syntax
 * @param {Map<string,string>} syntax.quotes - quote char -> token type; backslash escapes honoured
 * @param {Map<string,string>} [syntax.rawQuotes] - quote char -> token type; no escapes
 * @param {boolean} [syntax.heredoc] - recognise PHP `<<<LABEL` heredoc / nowdoc strings
 * @returns {Array<[string, string]>}
 */
function tokenizeSource (code, { quotes, rawQuotes = NO_QUOTES, heredoc = false }) {
  const tokens = []
  let tokenSoFar = ''
  let currentTokenType = 'code'
  let closer = '' // quote char or heredoc label that ends the current string token
  let escapable = false

  const open = (type, text) => {
    tokens.push([tokenSoFar, currentTokenType])
    tokenSoFar = text
    currentTokenType = type
  }
  const close = (text) => {
    tokens.push([tokenSoFar + text, currentTokenType])
    tokenSoFar = ''
    currentTokenType = 'code'
  }

  for (let i = 0; i < code.length; i++) {
    const currentChar = code[i]
    const nextChar = code[i + 1]
    if (currentTokenType === 'code') {
      if (currentChar === '/' && nextChar === '*') {
        open('multi-line-comment', '/*')
        i++ // consume the '*' so that "/*/" is not mistaken for a complete comment
      } else if (currentChar === '/' && nextChar === '/') {
        open('single-line-comment', '//')
        i++
      } else if (heredoc && code.startsWith('<<<', i)) {
        const lineEnd = code.indexOf('\n', i)
        const end = lineEnd === -1 ? code.length : lineEnd
        const opener = code.slice(i + 3, end).trim() // LABEL, "LABEL" or 'LABEL'
        open(opener.startsWith("'") ? 'nowdoc-string' : 'heredoc-string', '<<<' + opener)
        closer = /^['"]/.test(opener) ? opener.slice(1, -1) : opener
        i = end - 1 // the newline after the opener is picked up as part of the string body
      } else if (quotes.has(currentChar)) {
        open(quotes.get(currentChar), currentChar)
        closer = currentChar
        escapable = true
      } else if (rawQuotes.has(currentChar)) {
        open(rawQuotes.get(currentChar), currentChar)
        closer = currentChar
        escapable = false
      } else {
        tokenSoFar += currentChar
      }
    } else if (currentTokenType === 'multi-line-comment') {
      if (currentChar === '*' && nextChar === '/') {
        close('*/')
        i++
      } else {
        tokenSoFar += currentChar
      }
    } else if (currentTokenType === 'single-line-comment') {
      if (currentChar === '\n') {
        close('')
        tokenSoFar = '\n' // the line break belongs to the code that follows
      } else {
        tokenSoFar += currentChar
      }
    } else if (currentTokenType === 'heredoc-string' || currentTokenType === 'nowdoc-string') {
      if (isHeredocTerminator(code, i, closer)) {
        close(closer)
        i += closer.length - 1
      } else {
        tokenSoFar += currentChar
      }
    } else {
      // Inside a quoted string
      if (currentChar === closer && !(escapable && isEscaped(code, i))) {
        close(currentChar)
      } else {
        tokenSoFar += currentChar
      }
    }
  }
  // Whatever is left (possibly an unterminated comment/string) becomes the final token
  tokens.push([tokenSoFar, currentTokenType])
  return tokens
}

/** Remove every `keyword + ' '` occurrence from code tokens (in place). */
function removeKeywords (tokens, keywords) {
  for (const entry of tokens) {
    if (entry[1] !== 'code') continue
    for (const forRemoval of keywords) {
      entry[0] = entry[0].replace(new RegExp('\\b' + forRemoval + ' ', 'g'), '')
    }
  }
}

/** Apply user supplied `[old, now]` replacements to code tokens (in place). `old` may be a RegExp or a string. */
function applyReplacements (tokens, replacements) {
  if (!replacements) return
  for (const entry of tokens) {
    if (entry[1] !== 'code') continue
    for (const [old, now] of replacements) {
      entry[0] = old instanceof RegExp ? entry[0].replace(old, now) : entry[0].replaceAll(old, now)
    }
  }
}

/**
 * Return a fresh token list without empty tokens, without comments when `removeComments` is set,
 * and with adjacent code tokens merged (needed for correct blank-line handling afterwards).
 */
function compactTokens (tokens, removeComments) {
  const compacted = []
  for (const [tokenStr, tokenType] of tokens) {
    if (removeComments && (tokenType === 'multi-line-comment' || tokenType === 'single-line-comment')) continue
    if (tokenStr === '') continue
    const previous = compacted[compacted.length - 1]
    if (tokenType === 'code' && previous && previous[1] === 'code') {
      previous[0] += tokenStr
    } else {
      compacted.push([tokenStr, tokenType])
    }
  }
  return compacted
}

/**
 * Concatenate tokens, dropping whitespace-only lines from code tokens.
 * Only complete lines are dropped. The first piece of a code token finishes the line started by
 * the previous token and the last piece starts the line continued by the next token, so the line
 * breaks at those boundaries are preserved.
 */
function renderTokens (tokens) {
  let result = ''
  for (const [tokenStr, tokenType] of tokens) {
    if (tokenType !== 'code') {
      result += tokenStr
      continue
    }
    const segments = tokenStr.split('\n')
    const kept = []
    segments.forEach((segment, j) => {
      if (j === segments.length - 1) {
        kept.push(segment) // continued by the next token, keep verbatim (including indentation)
      } else if (segment.trim() !== '') {
        kept.push(segment)
      } else if (j === 0 && result !== '' && !result.endsWith('\n')) {
        kept.push('') // blank remainder of the previous token's line: keep only the line break
      }
    })
    result += kept.join('\n')
  }
  return result
}

const JAVA_TOKENS_TO_REMOVE = ['protected', 'private', 'public', 'final', 'abstract', 'synchronized', 'volatile', 'transient', 'native', 'strictfp']
const PHP_TOKENS_TO_REMOVE = ['public', 'private', 'protected', 'final', 'readonly']
const ANNO_MARK = '//annotationForRemoval/ '

/**
 * Strip Java source.
 * @param {string} code
 * @param {object} [options]
 * @param {string[]} [options.tokensToRemove] - keywords to delete (defaults to access/modifier keywords)
 * @param {boolean} [options.removeAnnotations] - drop lines that start with `@`
 * @param {boolean} [options.removeComments]
 * @param {boolean} [options.removeStrings] - replace every string literal with `""`
 * @param {Array<[string|RegExp, string]>} [options.replacements] - applied to code only
 * @returns {string}
 */
function stripJava (code, options = {}) {
  // First, we "tokenize" the code, by splitting it into 3 types of data: comments, strings, and code.
  const tokens = tokenizeSource(code, { quotes: JAVA_QUOTES })

  if (options.removeAnnotations) {
    // Mark annotation lines now; they are removed at the very end once lines have been re-assembled
    for (const entry of tokens) {
      if (entry[1] !== 'code') continue
      entry[0] = entry[0]
        .split('\n')
        .map((line) => line.trim().startsWith('@') ? ANNO_MARK + line : line)
        .join('\n')
    }
  }
  // Remove the keyword tokens that we don't want to keep. They always have a space after them, so nothing fancy is needed.
  removeKeywords(tokens, options.tokensToRemove || JAVA_TOKENS_TO_REMOVE)
  // Replace some user specified tokens with other tokens. Useful for renaming variables
  applyReplacements(tokens, options.replacements)

  let newTokens = compactTokens(tokens, options.removeComments)
  if (options.removeStrings) {
    // turn strings to empty strings
    newTokens = newTokens.map(([tokenStr, tokenType]) => tokenType === 'string' ? ['""', tokenType] : [tokenStr, tokenType])
  }
  const result = renderTokens(newTokens)
  if (!options.removeAnnotations) return result

  const finalLines = []
  for (const line of result.split('\n')) {
    if (line.trim().startsWith(ANNO_MARK)) continue
    // A mark in the middle of a line: keep only what follows it
    finalLines.push(line.includes(ANNO_MARK) ? line.split(ANNO_MARK)[1] : line)
  }
  return finalLines.join('\n')
}

/**
 * Strip PHP source. Understands `//` and block comments, single/double quoted strings and
 * heredoc / nowdoc strings.
 * @param {string} code
 * @param {object} [options] - `tokensToRemove`, `removeComments`, `replacements` (see stripJava)
 * @returns {string}
 */
function stripPHP (code, options = {}) {
  const tokens = tokenizeSource(code, { quotes: C_STYLE_QUOTES, heredoc: true })
  // Remove the keyword tokens that are not important for abstractly understanding the program
  removeKeywords(tokens, options.tokensToRemove || PHP_TOKENS_TO_REMOVE)
  applyReplacements(tokens, options.replacements)
  return renderTokens(compactTokens(tokens, options.removeComments))
}

/**
 * Strip Go source. Go doesn't have a lot of syntax tokens that can be removed, so only comments
 * (optionally) and blank lines are removed, plus any user replacements.
 * @param {string} code
 * @param {object} [options] - `removeComments`, `replacements` (see stripJava)
 * @returns {string}
 */
function stripGo (code, options = {}) {
  const tokens = tokenizeSource(code, { quotes: C_STYLE_QUOTES, rawQuotes: GO_RAW_QUOTES })
  applyReplacements(tokens, options.replacements)
  return renderTokens(compactTokens(tokens, options.removeComments))
}

/** Remove every character outside the 7-bit ASCII range. */
function removeNonAscii (str) {
  return str.replace(/[^\x00-\x7F]/g, '') // eslint-disable-line no-control-regex
}

/**
 * Keeps letters, combining marks, numbers, separators, punctuation and symbols (which covers
 * emoji) of any script, plus tab and newline; everything else is removed.
 * Combining marks (\p{M}) are kept because decomposed accents and many scripts depend on them.
 */
function removeSpecialUnicode (str) {
  return str.replace(/[^\p{L}\p{M}\p{N}\p{Z}\p{P}\p{S}\t\n]/gu, '')
}

/** True when `str` contains `char` at least once and otherwise only spaces and tabs. */
function strOnlyContainsCharExcludingWhitespace (str, char) {
  let found = false
  for (const c of str) {
    if (c === char) {
      found = true
    } else if (c !== ' ' && c !== '\t') {
      return false
    }
  }
  return found
}

// Number of leading spaces that turns a line into a preformatted (indented code) line
const PREFORMAT_INDENT = 4

/**
 * Split markdown into tokens: `[raw, 'text']`, `[raw, 'comment']`, `[raw, 'pre']`,
 * `[raw, 'code', lang, code]` for fenced blocks and `[raw, 'preformat', code]` for indented blocks.
 * @param {string} comment - markdown text
 * @param {object} [options]
 * @param {boolean} [options.allowMalformed] - emit unterminated code blocks / comments as-is instead of throwing
 * @returns {Array<Array<string>>}
 * @throws {Error} on an unterminated fenced code block or HTML comment unless `allowMalformed` is set
 */
function tokenizeMarkdown (comment, options = {}) {
  const tokens = []
  let tokenSoFar = ''
  let inCodeBlock = false
  let inCodeLang
  let inPreTag = false
  let linePadding = 0
  for (let i = 0; i < comment.length; i++) {
    const currentChar = comment[i]
    const lastChar = comment[i - 1]
    const slice = comment.slice(i)
    if (lastChar === '\n') {
      // NOTE(review): only the first tab of the remaining text is converted here
      linePadding = countStart(slice.replace('\t', ' '), ' ')
    }
    if (inPreTag) {
      if (slice.startsWith('</pre>')) {
        tokens.push([tokenSoFar + '</pre>', 'pre'])
        i += 5
        inPreTag = false
        tokenSoFar = ''
      } else {
        tokenSoFar += currentChar
      }
    } else if (inCodeBlock) {
      // This handles backticks closing code blocks. It's tricky as the markdown spec isn't clear on this.
      // On top of that, once LLMs start generating text (for example with 3 backticks), they can't backtrack
      // and add more backticks to the enclosing codeblock to avoid escaping problems. This means it is not
      // possible to ascertain starting and ending code blocks just by looking at n-back tick chars, we must
      // also on top make sure the padding for the start/stop backtick'ed lines are the same. This seems to work
      // well and also handles tabulation, for example a paragraph that's got 1-3 spaces of indent (4+ would be a pre block).
      if (slice.startsWith(inCodeBlock.tag) && (inCodeBlock.padding === linePadding)) {
        const code = tokenSoFar.slice(inCodeBlock.tag.length + inCodeLang.length + 1) // +1 for the newline after ```
        tokens.push([tokenSoFar + inCodeBlock.tag, 'code', inCodeLang, code])
        i += inCodeBlock.tag.length
        inCodeBlock = false
        tokenSoFar = ''
      } else {
        tokenSoFar += currentChar
      }
    } else {
      if (lastChar === '\n' && slice.startsWith(' '.repeat(PREFORMAT_INDENT))) {
        // Handle indented preformatted text blocks.
        // This is a bit tricky as we need to check if the last line is empty or a markdown header before
        // we can allow a preformatted block to start. Also, multiple subsequent preformatted blocks should
        // be concatenated, or even text blocks if they are empty, so we have to concat afterwards in postproc.
        const lastLine = tokenSoFar.slice(0, -1).split('\n').pop()
        if (lastLine.trim() === '' || lastLine.startsWith('#')) {
          // 4-space code block for this whole line
          tokens.push([tokenSoFar, 'text'])
          tokenSoFar = ''
          let lineEnd = slice.indexOf('\n')
          if (lineEnd === -1) lineEnd = slice.length
          const raw = slice.slice(0, lineEnd + 1)
          const code = slice.slice(PREFORMAT_INDENT, lineEnd)
          tokens.push([raw, 'preformat', code])
          i += lineEnd
          continue
        }
      }
      if (slice.startsWith('<!--')) {
        // Comment
        const end = slice.indexOf('-->')
        if (end === -1) {
          if (!options.allowMalformed) throw new Error('Unmatched markdown comment')
          tokens.push([tokenSoFar, 'text'])
          tokens.push([slice, 'comment'])
          tokenSoFar = '' // already emitted above; don't emit it a second time after the loop
          break
        }
        tokens.push([tokenSoFar, 'text'])
        tokens.push([slice.slice(0, end + 3), 'comment'])
        i += end + 2
        tokenSoFar = ''
        continue
      }
      const preMatch = slice.match(/^<pre>/)
      const codeMatch = slice.match(/^([`]{3,})([a-zA-Z]*)\n/)
      if (codeMatch) {
        tokens.push([tokenSoFar, 'text'])
        inCodeBlock = { tag: codeMatch[1], padding: linePadding }
        inCodeLang = codeMatch[2]
        tokenSoFar = codeMatch[0]
        i += tokenSoFar.length - 1
      } else if (preMatch) {
        tokens.push([tokenSoFar, 'text'])
        inPreTag = true
        tokenSoFar = preMatch[0]
        i += tokenSoFar.length - 1
      } else {
        tokenSoFar += currentChar
      }
    }
  }
  if (inCodeBlock && !options.allowMalformed) {
    throw new Error('Unmatched code block')
  }
  // Trailing text; with allowMalformed this is also where an unterminated code block ends up (once)
  tokens.push([tokenSoFar, 'text'])

  // Now we need to merge adjacent preformatted blocks or preformatted blocks with spacing text blocks between
  const updated = []
  for (let i = 0; i < tokens.length; i++) {
    const token = tokens[i]
    if (token[1] === 'preformat') {
      let intermediateEmptyLines = ''
      for (let j = i + 1; j < tokens.length; j++) {
        const nextToken = tokens[j]
        if (nextToken[1] === 'preformat') {
          if (intermediateEmptyLines) {
            const lineCount = count(intermediateEmptyLines, '\n')
            token[0] += intermediateEmptyLines
            token[2] += '\n'.repeat(lineCount)
            intermediateEmptyLines = ''
          }
          token[0] += nextToken[0]
          token[2] += '\n' + nextToken[2]
          i = j
        } else if (nextToken[1] === 'text' && nextToken[0].trim() === '') {
          intermediateEmptyLines += nextToken[0]
        } else {
          break
        }
      }
    }
    updated.push(token)
  }
  return updated
}

/**
 * This mainly erases extraneous new lines outside of code blocks, including ones with empty block quotes.
 * @param {string} comment - markdown text; falsy input yields ''
 * @param {object} [options]
 * @param {Array<[string|RegExp, string]>} [options.replacements] - applied outside fenced code blocks
 * @param {boolean} [options.stripEmailQuotes] - within a token, stop at an "On ... <...> wrote:" line
 * @param {boolean} [options.allowMalformed] - forwarded to tokenizeMarkdown
 * @returns {string}
 */
function stripMarkdown (comment, options = {}) {
  if (!comment) return ''
  comment = normalizeLineEndings(comment)
  comment = removeSpecialUnicode(comment)
  // First, split by any codeblocks
  const tokens = tokenizeMarkdown(comment, options)
  // Now go through the tokens
  const updated = []
  for (const token of tokens) {
    if (token[1] === 'code') {
      // Don't update code
      updated.push(token[0])
      continue
    }
    // Replace \n\n or any extra \n's with one \n
    let update = removeExtraLines(token[0])
    if (options.replacements) {
      for (const replacement of options.replacements) {
        update = replacement[0] instanceof RegExp
          ? update.replace(replacement[0], replacement[1])
          : update.replaceAll(replacement[0], replacement[1])
      }
    }
    const final = []
    for (const line of update.split('\n')) {
      const tline = line.trim()
      if (tline === '') continue
      if (options.stripEmailQuotes && tline.startsWith('On ') && tline.endsWith('> wrote:')) break
      // if the line only has ">" blockquote characters, skip it
      if (strOnlyContainsCharExcludingWhitespace(tline, '>')) continue
      final.push(line)
    }
    updated.push(final.join('\n'))
  }
  return updated.join('\n').trim()
}

const DEFAULT_EXCLUDE = [/node_modules/, /\.git/, /\/build\//, /\/dist\//]

/**
 * Reduce a `git diff`: excluded files are replaced by a binary-file placeholder, files rejected by
 * `options.matching` are dropped entirely, and (optionally) the hunks of `.java` files are run
 * through stripJava.
 * @param {string} diff
 * @param {object} [options]
 * @param {RegExp[]} [options.excluding] - tested against each `diff --git` header line (default DEFAULT_EXCLUDE)
 * @param {(file: string, mode: string, excluded: boolean) => (boolean|void)} [options.matching]
 *   called per file with mode 'modified' | 'created' | 'deleted'; returning `false` drops the file
 * @param {boolean} [options.stripDiffFiles] - strip the contents of `.java` files; the whole
 *   `options` object is passed on to stripJava
 * @returns {string}
 */
function stripDiff (diff, options = {}) {
  const exclude = options.excluding || DEFAULT_EXCLUDE
  const lines = diff.split('\n')
  const inter = []
  let inExcluded = false
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i]
    if (line.startsWith('diff --git')) {
      inExcluded = exclude.some((ex) => ex.test(line))
      if (options.matching) {
        const nextLine = lines[i + 1] ?? '' // the header may be the very last line
        const file = line.split(' b/')[1]
        let mode = 'modified'
        if (nextLine.startsWith('new file')) mode = 'created'
        else if (nextLine.startsWith('deleted file')) mode = 'deleted'
        if (options.matching(file, mode, inExcluded) === false) {
          inExcluded = true
          continue
        }
      }
      if (inExcluded) {
        // Treat this as a binary file
        inter.push(line)
        inter.push('index 0000000..0000000')
        inter.push('Binary files differ')
      }
    }
    if (inExcluded) continue
    inter.push(line)
  }

  // Locate every file in the filtered diff: [start, end) lines, and the line of its first hunk
  const regions = []
  let currentFile
  let currentFileIx
  let currentFileContentsIx
  const closeRegion = (end) => {
    if (!currentFile) return
    regions.push({ file: currentFile.trim(), start: currentFileIx, fileStart: currentFileContentsIx, end })
  }
  for (let i = 0; i < inter.length; i++) {
    const line = inter[i]
    if (line.startsWith('diff --git')) {
      closeRegion(i)
      currentFile = line
      currentFileIx = i
      currentFileContentsIx = null
    }
    if (line.startsWith('@@')) {
      currentFileContentsIx ||= i
    }
  }
  closeRegion(inter.length) // the last file has no following header to close it
  regions.reverse() // we want to start from the bottom so earlier indexes stay valid while splicing

  const SIG_PLUS = '\t\t \t'
  const SIG_MINUS = '\t \t\t'
  const SUB_KEYWORD = `$STORED_${(Math.random() * 1000) | 0}_`

  const stripFile = (region, usingMethod) => {
    const storedVariables = []
    const slice = inter.slice(region.fileStart, region.end).map((line) => {
      // We need to convert the git diff to normal Java so it can be stripped. But we need to keep the git data like @/+/-
      // so we either sub+map and store or add a prefix signature (spacing is ignored so we can add a space based prefix)
      if (line.startsWith('@@')) {
        const forStore = line.split(' @@')
        storedVariables.push(forStore[0] + ' @@')
        return SUB_KEYWORD + storedVariables.length + forStore[1]
      } else if (line.startsWith('+')) {
        return SIG_PLUS + line.slice(1)
      } else if (line.startsWith('-')) {
        return SIG_MINUS + line.slice(1)
      }
      return line
    })
    let stripped = usingMethod(slice.join('\n'), options)
      .replaceAll(SIG_PLUS, '+')
      .replaceAll(SIG_MINUS, '-')
    // Highest number first, so that placeholder 1 never matches the prefix of placeholder 10
    for (let i = storedVariables.length - 1; i >= 0; i--) {
      stripped = stripped.replace(SUB_KEYWORD + (i + 1), storedVariables[i])
    }
    inter.splice(region.fileStart, region.end - region.fileStart, ...stripped.split('\n'))
  }

  if (options.stripDiffFiles) {
    for (const region of regions) {
      if (!region.fileStart) continue // no hunks (e.g. binary placeholder)
      if (region.file.endsWith('.java')) stripFile(region, stripJava)
    }
  }
  return inter.join('\n')
}

const stripping = { stripJava, stripPHP, stripGo, stripMarkdown, stripDiff, removeNonAscii, normalizeLineEndings, tokenizeMarkdown, stripXmlComments, stripMdpComments }
// Guarded so the file can also be evaluated where the CommonJS `module` binding does not exist
if (typeof module !== 'undefined') module.exports = stripping