UNPKG

devcontext

Version:

DevContext is a cutting-edge Model Context Protocol (MCP) server designed to provide developers with continuous, project-centric context awareness.

1,820 lines (1,622 loc) 72.9 kB
/**
 * TextTokenizerLogic.js
 *
 * Provides text tokenization with language-specific enhancements
 * for more accurate code analysis and context understanding.
 */

/**
 * Tokenizes input text with language-specific tokenization rules
 *
 * @param {string} text - The text to tokenize
 * @param {string} language - The programming language of the text (default: 'plaintext')
 * @returns {string[]} An array of tokens (empty for missing/empty input)
 */
export function tokenize(text, language = "plaintext") {
  // Robustness: nothing to tokenize for null/undefined/empty input.
  // (The generic tokenizer already returned [] for "", so this is compatible.)
  if (!text) return [];

  // Normalize to lowercase as requested
  const normalizedText = text.toLowerCase();

  // Normalize the language tag too, so "JavaScript"/"PYTHON" etc. do not
  // silently fall through to the generic tokenizer. This matches the
  // normalization identifyLanguageSpecificIdioms already performs.
  switch ((language || "plaintext").toLowerCase()) {
    case "javascript":
    case "typescript":
    case "jsx":
    case "tsx":
      return tokenizeJavaScript(normalizedText);
    case "python":
      return tokenizePython(normalizedText);
    case "java":
    case "csharp":
    case "c#":
      return tokenizeJavaLike(normalizedText);
    case "ruby":
      return tokenizeRuby(normalizedText);
    case "go":
      return tokenizeGo(normalizedText);
    case "plaintext":
    default:
      return tokenizeGeneric(normalizedText);
  }
}

/**
 * Generates n-grams from an array of tokens, respecting semantic boundaries where possible
 *
 * A boundary is recorded on both sides of placeholder tokens (`__...__`) and
 * of common statement punctuation; any window that would cross a boundary is
 * skipped so n-grams never span a syntactic break.
 *
 * @param {string[]} tokens - Array of tokens (from tokenize function)
 * @param {number} n - Size of n-grams to generate (e.g., 2 for bigrams, 3 for trigrams)
 * @returns {string[]} Array of n-gram strings
 */
export function generateNgrams(tokens, n) {
  // Edge cases: no tokens or a non-positive n yields nothing; fewer tokens
  // than n yields the whole sequence as a single (shorter) gram.
  if (!tokens || tokens.length === 0) return [];
  if (n <= 0) return [];
  if (tokens.length < n) return [tokens.join(" ")];

  // Hoisted out of the scan loop (the original rebuilt this array-literal
  // membership check on every token; a Set lookup is O(1) and allocation-free).
  const boundaryPunctuation = new Set([";", ".", "{", "}", "(", ")", "[", "]"]);

  // Positions that represent semantic boundaries: a boundary at position p
  // forbids any window having p strictly in its interior.
  const semanticBoundaries = new Set();
  for (let i = 0; i < tokens.length; i++) {
    const token = tokens[i];
    // Special placeholder tokens (e.g. __STRING_0__) and end-of-statement
    // punctuation both mark a break on either side of the token.
    if (
      (token.startsWith("__") && token.endsWith("__")) ||
      boundaryPunctuation.has(token)
    ) {
      semanticBoundaries.add(i);
      semanticBoundaries.add(i + 1);
    }
  }

  // Slide a window of size n over the tokens; emit only boundary-free windows.
  const ngrams = [];
  for (let i = 0; i <= tokens.length - n; i++) {
    let hasBoundary = false;
    // Interior positions of the window are i+1 .. i+n-1 (same positions the
    // original checked via has(j + 1) for j in [i, i+n-2]).
    for (let j = i + 1; j < i + n; j++) {
      if (semanticBoundaries.has(j)) {
        hasBoundary = true;
        break;
      }
    }
    if (!hasBoundary) {
      ngrams.push(tokens.slice(i, i + n).join(" "));
    }
  }
  return ngrams;
}

/**
 * Extracts n-grams from an array of tokens (alias for generateNgrams)
 *
 * @param {string[]} tokens - Array of tokens (from tokenize function)
 * @param {number} n - Size of n-grams to generate (e.g., 2 for bigrams, 3 for trigrams)
 * @returns {string[]} Array of n-gram strings
 */
export function extractNGrams(tokens, n) {
  return generateNgrams(tokens, n);
}

/**
 * Identifies language-specific idioms in code text
 *
 * @param {string} text - Raw text to analyze
 * @param {string} language - Programming language of the text
 * @returns {{idiom: string, type: string, location: {start: number, end: number}}[]} Array of identified idioms
 */
export function identifyLanguageSpecificIdioms(text, language) {
  // Handle empty input
  if (!text) return [];

  const idioms = [];

  // Normalize language parameter (guarded so a nullish language yields [],
  // consistent with the empty-text guard above, instead of throwing).
  const normalizedLanguage = (language || "").toLowerCase();

  // Use language-specific idiom detection
  switch (normalizedLanguage) {
    case "javascript":
    case "typescript":
    case "jsx":
    case "tsx":
      identifyJavaScriptIdioms(text, idioms);
      break;
    case "python":
      identifyPythonIdioms(text, idioms);
      break;
    case "csharp":
    case "c#":
      identifyCSharpIdioms(text, idioms);
      break;
    // Add more languages as needed
  }

  return idioms;
}
/**
 * Identifies JavaScript-specific idioms in the given code text.
 * @param {string} text - JavaScript code text
 * @param {Array} idioms - Array to add identified idioms to
 * @private
 */
function identifyJavaScriptIdioms(text, idioms) {
  // NOTE(review): every regex below is created fresh on each call, so the
  // stateful `lastIndex` of the `g` flag cannot leak between invocations.
  // 1. Detect Promise chains (.then().catch())
  const promiseChainRegex =
    /\.\s*then\s*\(\s*(?:function\s*\([^)]*\)|[^=>(]*=>\s*[^)]*)\s*\)(?:\s*\.(?:then|catch|finally)\s*\([^)]*\))+/g;
  let match;
  while ((match = promiseChainRegex.exec(text)) !== null) {
    idioms.push({
      idiom: match[0],
      type: "js_promise_chain",
      location: {
        start: match.index,
        end: match.index + match[0].length,
      },
    });
  }
  // 2. Detect async/await usage
  const asyncAwaitRegex =
    /\basync\s+(?:function\s*[a-zA-Z0-9_$]*\s*\([^)]*\)|(?:[a-zA-Z0-9_$]+\s*=>)|(?:\([^)]*\)\s*=>))(?:(?:.|\n)*?\bawait\b(?:.|\n)*?)/g;
  while ((match = asyncAwaitRegex.exec(text)) !== null) {
    idioms.push({
      idiom: match[0],
      type: "js_async_await",
      location: {
        start: match.index,
        end: match.index + match[0].length,
      },
    });
  }
  // 3. Detect arrow functions as callbacks
  const arrowCallbackRegex =
    /(?:\.|\()(?:[a-zA-Z0-9_$]+)?\s*\(\s*(?:\([^)]*\)|[a-zA-Z0-9_$]+)\s*=>\s*(?:{[^}]*}|[^);,]*)/g;
  while ((match = arrowCallbackRegex.exec(text)) !== null) {
    // Avoid duplicate detection with Promise chains
    // (a match fully contained inside an already-recorded chain is dropped)
    const isDuplicate = idioms.some(
      (idiom) =>
        idiom.type === "js_promise_chain" &&
        match.index >= idiom.location.start &&
        match.index + match[0].length <= idiom.location.end
    );
    if (!isDuplicate) {
      idioms.push({
        idiom: match[0],
        type: "js_arrow_callback",
        location: {
          start: match.index,
          end: match.index + match[0].length,
        },
      });
    }
  }
}

/**
 * Identifies Python-specific idioms
 *
 * @param {string} text - Python code text
 * @param {Array} idioms - Array to add identified idioms to
 * @private
 */
function identifyPythonIdioms(text, idioms) {
  // 1. Detect list comprehensions
  const listComprehensionRegex =
    /\[\s*[^\[\]]*\s+for\s+[^\[\]]+\s+in\s+[^\[\]]+(?:\s+if\s+[^\[\]]+)?\s*\]/g;
  let match;
  while ((match = listComprehensionRegex.exec(text)) !== null) {
    idioms.push({
      idiom: match[0],
      type: "python_list_comprehension",
      location: {
        start: match.index,
        end: match.index + match[0].length,
      },
    });
  }
  // 2. Detect dictionary comprehensions
  const dictComprehensionRegex =
    /\{\s*[^{}]*\s*:\s*[^{}]*\s+for\s+[^{}]+\s+in\s+[^{}]+(?:\s+if\s+[^{}]+)?\s*\}/g;
  while ((match = dictComprehensionRegex.exec(text)) !== null) {
    idioms.push({
      idiom: match[0],
      type: "python_dict_comprehension",
      location: {
        start: match.index,
        end: match.index + match[0].length,
      },
    });
  }
  // 3. Detect lambda functions
  const lambdaRegex = /lambda\s+[^:]+:[^,\n)]+/g;
  while ((match = lambdaRegex.exec(text)) !== null) {
    idioms.push({
      idiom: match[0],
      type: "python_lambda",
      location: {
        start: match.index,
        end: match.index + match[0].length,
      },
    });
  }
  // 4. Detect generator expressions
  const generatorRegex =
    /\(\s*[^()]*\s+for\s+[^()]+\s+in\s+[^()]+(?:\s+if\s+[^()]+)?\s*\)/g;
  while ((match = generatorRegex.exec(text)) !== null) {
    idioms.push({
      idiom: match[0],
      type: "python_generator_expression",
      location: {
        start: match.index,
        end: match.index + match[0].length,
      },
    });
  }
}

/**
 * Identifies C#-specific idioms
 *
 * @param {string} text - C# code text
 * @param {Array} idioms - Array to add identified idioms to
 * @private
 */
function identifyCSharpIdioms(text, idioms) {
  // 1.
// Detect LINQ queries with method syntax
  const linqMethodRegex =
    /\.\s*(?:Where|Select|OrderBy|OrderByDescending|GroupBy|Join|Skip|Take|First|FirstOrDefault|Any|All|Count)\s*\(\s*[^)]*\)(?:\s*\.\s*(?:Where|Select|OrderBy|OrderByDescending|GroupBy|Join|Skip|Take|First|FirstOrDefault|Any|All|Count)\s*\(\s*[^)]*\))*/g;
  let match;
  while ((match = linqMethodRegex.exec(text)) !== null) {
    idioms.push({
      idiom: match[0],
      type: "csharp_linq_method",
      location: {
        start: match.index,
        end: match.index + match[0].length,
      },
    });
  }
  // 2. Detect LINQ queries with query syntax
  // NOTE(review): the optional clauses here are order-dependent (where before
  // orderby before select) — queries written in another order only match the
  // leading `from ... in ...` part; verify against real inputs.
  const linqQueryRegex =
    /from\s+\w+\s+in\s+[^{]+(?:where\s+[^{]+)?(?:orderby\s+[^{]+)?(?:select\s+[^{;]+)?(?:group\s+[^{;]+by\s+[^{;]+)?/g;
  while ((match = linqQueryRegex.exec(text)) !== null) {
    idioms.push({
      idiom: match[0],
      type: "csharp_linq_query",
      location: {
        start: match.index,
        end: match.index + match[0].length,
      },
    });
  }
  // 3. Detect async/await patterns
  const asyncAwaitRegex =
    /\basync\s+[^(]*\([^)]*\)(?:\s*<[^>]*>)?\s*(?:=>)?\s*{(?:(?:.|\n)*?\bawait\b(?:.|\n)*?)}/g;
  while ((match = asyncAwaitRegex.exec(text)) !== null) {
    idioms.push({
      idiom: match[0],
      type: "csharp_async_await",
      location: {
        start: match.index,
        end: match.index + match[0].length,
      },
    });
  }
  // 4. Detect lambda expressions
  const lambdaRegex = /(?:\([^)]*\)|\w+)\s*=>\s*(?:{[^}]*}|[^;]+)/g;
  while ((match = lambdaRegex.exec(text)) !== null) {
    // Avoid duplicate detection with LINQ methods
    // (lambdas fully contained in a recorded LINQ match are skipped)
    const isDuplicate = idioms.some(
      (idiom) =>
        (idiom.type === "csharp_linq_method" ||
          idiom.type === "csharp_linq_query") &&
        match.index >= idiom.location.start &&
        match.index + match[0].length <= idiom.location.end
    );
    if (!isDuplicate) {
      idioms.push({
        idiom: match[0],
        type: "csharp_lambda",
        location: {
          start: match.index,
          end: match.index + match[0].length,
        },
      });
    }
  }
}

/**
 * Extracts keywords from an array of tokens with language-specific enhancements
 *
 * @param {string[]} tokens - Array of tokens (from tokenize function)
 * @param {number} topN - Number of top keywords to return (default: 10)
 * @param {string} language - Programming language hint (default: 'plaintext')
 * @returns {{keyword: string, score: number}[]} Array of keywords with scores
 */
export function extractKeywords(tokens, topN = 10, language = "plaintext") {
  // Get language-specific stop words
  const stopWords = getStopWords(language);
  // Calculate term frequencies
  const termFrequencies = {};
  for (const token of tokens) {
    if (!termFrequencies[token]) {
      termFrequencies[token] = 0;
    }
    termFrequencies[token]++;
  }
  // Apply scoring heuristics
  const scoredKeywords = [];
  for (const [token, frequency] of Object.entries(termFrequencies)) {
    // Skip stop words unless they're part of something significant
    // (e.g., longer than typical stop words or contain special characters)
    if (stopWords.has(token) && token.length < 6 && !/[_\-$#@]/.test(token)) {
      continue;
    }
    // Base score is the term frequency
    let score = frequency;
    // Boost domain-specific tokens (identifiers)
    if (isDomainSpecificToken(token, language)) {
      score *= 2.0;
    }
    // Boost longer words (they tend to be more meaningful)
    if (token.length > 6) {
      score *= 1.5;
    }
    // Boost tokens with special characters that are likely important in code
    if (/[_$]/.test(token)) {
      score
*= 1.2;
    }
    // Penalize very short tokens that aren't likely to be meaningful
    if (token.length < 3 && !/[_\-$#@]/.test(token)) {
      score *= 0.5;
    }
    // Additional boosts for language-specific patterns
    score = applyLanguageSpecificBoosts(token, score, language);
    scoredKeywords.push({
      keyword: token,
      score: score,
    });
  }
  // Sort by score (descending) and return top N
  return scoredKeywords.sort((a, b) => b.score - a.score).slice(0, topN);
}

/**
 * Determines if a token is likely a domain-specific identifier
 *
 * @param {string} token - The token to check
 * @param {string} language - The programming language
 * @returns {boolean} True if the token appears to be domain-specific
 */
function isDomainSpecificToken(token, language) {
  // Check for common patterns that indicate domain-specific tokens
  // CamelCase or PascalCase (common in most languages)
  // NOTE(review): tokenize() lowercases its input before tokenizing, so these
  // uppercase-sensitive checks can only fire for tokens obtained elsewhere —
  // verify the intended callers.
  if (/[a-z][A-Z]/.test(token) || /^[A-Z][a-z]/.test(token)) {
    return true;
  }
  // snake_case (common in Python, Ruby)
  if (token.includes("_") && token.length > 4) {
    return true;
  }
  // Special prefixes/patterns common in various languages
  if (/^(on|handle|process|get|set|is|has|should|with)/i.test(token)) {
    return true;
  }
  // Tokens with numbers are often domain-specific
  if (/[a-z][0-9]/.test(token)) {
    return true;
  }
  // JavaScript/TypeScript specific
  if (
    (language === "javascript" || language === "typescript") &&
    (/\$/.test(token) || // Angular, jQuery
      /^use[A-Z]/.test(token))
  ) {
    // React hooks
    return true;
  }
  // Python specific
  if (
    language === "python" &&
    (/^__.*__$/.test(token) || // dunder methods
      /^self\./.test(token))
  ) {
    // instance attributes
    return true;
  }
  return false;
}

/**
 * Apply language-specific score boosts to tokens
 *
 * @param {string} token - The token to apply boosts to
 * @param {number} score - The current score
 * @param {string} language - The programming language
 * @returns {number} The updated score
 */
function applyLanguageSpecificBoosts(token, score, language) {
  switch (language) {
    case "javascript":
    case "typescript":
    case "jsx":
    case "tsx":
      // Boost React/component related terms
      if (
        /^(use|component|props|state|render|effect|memo|callback)/.test(token)
      ) {
        score *= 1.5;
      }
      // Boost event handler patterns
      if (/^(on[A-Z]|handle[A-Z])/.test(token)) {
        score *= 1.3;
      }
      break;
    case "python":
      // Boost important Python patterns
      if (/^(def|class|self|super|__init__|__main__)/.test(token)) {
        score *= 1.3;
      }
      // Boost decorators
      if (/^@/.test(token)) {
        score *= 1.4;
      }
      break;
    case "java":
    case "csharp":
    case "c#":
      // Boost important Java/C# patterns
      if (
        /^(public|private|protected|static|final|override|virtual|abstract)/.test(
          token
        )
      ) {
        score *= 1.2;
      }
      // Boost class/interface/enum declarations
      if (/^(class|interface|enum|record|struct)/.test(token)) {
        score *= 1.3;
      }
      break;
    case "ruby":
      // Boost Ruby-specific patterns
      if (/^(attr_|def|class|module|require|include|extend)/.test(token)) {
        score *= 1.3;
      }
      // Boost symbols
      if (/^:/.test(token)) {
        score *= 1.2;
      }
      break;
    case "go":
      // Boost Go-specific patterns
      if (/^(func|struct|interface|type|go|chan|defer|goroutine)/.test(token)) {
        score *= 1.3;
      }
      break;
  }
  return score;
}

/**
 * Get stop words for the specified language
 *
 * @param {string} language - The programming language
 * @returns {Set<string>} Set of stop words
 */
function getStopWords(language) {
  // Common English stop words
  const commonStopWords = new Set([
    "a", "an", "the", "and", "or", "but", "if", "then", "else", "when",
    "at", "from", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below",
    "to", "is", "am", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    "would", "should", "could", "ought", "i", "you", "he", "she", "it",
    "we", "they", "their", "this", "that", "these", "those", "of", "in",
    "as", "on", "not", "no", "its", "his", "her",
  ]);
  // Common programming language keywords
  const commonProgrammingStopWords = new Set([
    "function", "class", "if",
"else", "for", "while", "do", "switch", "case", "break", "continue", "return", "try", "catch", "finally", "throw", "throws", "public", "private", "protected", "static", "final", "abstract", "interface", "extends", "implements", "import", "export", "package", "namespace", "var", "let", "const", "new", "this", "super", "null", "undefined", "true", "false", ]); // Start with common stop words for all languages const stopWords = new Set([ ...commonStopWords, ...commonProgrammingStopWords, ]); // Add language-specific stop words switch (language) { case "javascript": case "typescript": case "jsx": case "tsx": // JavaScript/TypeScript specific [ "typeof", "instanceof", "async", "await", "yield", "void", "delete", "module", "require", "console", "log", "window", "document", "event", "prototype", "constructor", "string", "number", "boolean", "object", "array", ].forEach((word) => stopWords.add(word)); break; case "python": // Python specific [ "def", "lambda", "from", "as", "import", "with", "is", "in", "not", "and", "or", "global", "nonlocal", "pass", "yield", "assert", "del", "raise", "except", "print", "exec", "eval", "None", "True", "False", "range", "len", "self", ].forEach((word) => stopWords.add(word)); break; case "java": // Java specific [ "void", "boolean", "byte", "char", "short", "int", "long", "float", "double", "instanceof", "strictfp", "synchronized", "transient", "volatile", "native", "package", "throws", "throw", "exception", "assert", "enum", ].forEach((word) => stopWords.add(word)); break; case "csharp": case "c#": // C# specific [ "using", "namespace", "where", "select", "from", "group", "into", "orderby", "join", "equals", "out", "ref", "in", "value", "is", "as", "void", "int", "string", "bool", "decimal", "object", "char", "byte", "sbyte", "uint", "long", "ulong", "short", "ushort", "double", "float", "dynamic", "delegate", "event", "async", "await", "partial", "virtual", "override", "sealed", "base", ].forEach((word) => stopWords.add(word)); break; 
case "ruby": // Ruby specific [ "def", "end", "module", "require", "include", "extend", "attr", "attr_reader", "attr_writer", "attr_accessor", "lambda", "proc", "yield", "self", "nil", "true", "false", "unless", "until", "begin", "rescue", "ensure", "alias", ].forEach((word) => stopWords.add(word)); break; case "go": // Go specific [ "func", "type", "struct", "interface", "map", "chan", "go", "select", "package", "import", "const", "var", "iota", "make", "new", "append", "len", "cap", "nil", "true", "false", "int", "int8", "int16", "int32", "int64", "uint", "uint8", "uint16", "uint32", "uint64", "float32", "float64", "string", "byte", "rune", "defer", "panic", "recover", ].forEach((word) => stopWords.add(word)); break; } return stopWords; } /** * Generic tokenization for unknown languages or plaintext * * @param {string} text - The text to tokenize * @returns {string[]} An array of tokens */ function tokenizeGeneric(text) { // Replace common punctuation with spaces before splitting // But preserve meaningful symbols like _, -, #, @ if part of identifiers const withSpaces = text // Preserve common identifier patterns .replace(/([a-z0-9])[-_]([a-z0-9])/g, "$1\u0001$2") // Replace with placeholder // Add space around punctuation .replace(/[.,;:(){}[\]<>?!]/g, " $& ") // Restore preserved symbols .replace(/\u0001/g, "_"); // Split by whitespace and filter out empty tokens let tokens = withSpaces.split(/\s+/).filter((token) => token.length > 0); return tokens; } /** * JavaScript/TypeScript-specific tokenization * Handles camelCase, PascalCase, module imports, JSX tags, template literals, decorators * * @param {string} text - The JavaScript/TypeScript text to tokenize * @returns {string[]} An array of tokens */ function tokenizeJavaScript(text) { let tokens = []; // Preserve comments for content analysis but mark them specially const commentPlaceholders = {}; let commentCounter = 0; // Remove block comments first const withoutBlockComments = 
text.replace(/\/\*[\s\S]*?\*\//g, (match) => {
    const placeholder = `__COMMENT_BLOCK_${commentCounter++}__`;
    commentPlaceholders[placeholder] = match;
    return placeholder;
  });
  // Remove line comments
  const withoutComments = withoutBlockComments.replace(
    /\/\/[^\n]*/g,
    (match) => {
      const placeholder = `__COMMENT_LINE_${commentCounter++}__`;
      commentPlaceholders[placeholder] = match;
      return placeholder;
    }
  );
  // Track string literals and code blocks to avoid tokenizing their contents incorrectly
  const stringPlaceholders = {};
  let stringCounter = 0;
  // Handle regex literals - important to do this before handling division operator
  // Look for patterns like /.../ not preceded by identifiers or closing brackets/parentheses
  // NOTE(review): the lookbehind assertion requires an ES2018+ runtime.
  const withoutRegex = withoutComments.replace(
    /(?<![a-zA-Z0-9_\)\]\}])\/(?:\\\/|[^\/\n])+\/[gimuy]*/g,
    (match) => {
      const placeholder = `__REGEX_${stringCounter++}__`;
      stringPlaceholders[placeholder] = match;
      return placeholder;
    }
  );
  // Handle template literals with interpolation
  // Capture the whole template including expressions inside ${}
  const withoutTemplateLiterals = withoutRegex.replace(
    /`(?:\\`|\\\\|[^`])*`/g,
    (match) => {
      const placeholder = `__TEMPLATE_${stringCounter++}__`;
      stringPlaceholders[placeholder] = match;
      // Extract interpolated expressions from ${...} and tokenize them separately
      const expressions = [];
      let expContent = match.match(/\${([^}]*)}/g);
      if (expContent) {
        expContent.forEach((exp) => {
          expressions.push(exp.slice(2, -1)); // Remove ${ and }
        });
        // Tokenize each expression content
        expressions.forEach((exp) => {
          const expTokens = tokenizeJavaScript(exp); // Recursively tokenize expressions
          tokens.push(...expTokens);
        });
      }
      return placeholder;
    }
  );
  // Handle string literals with placeholder
  const withoutStrings = withoutTemplateLiterals.replace(
    /'(?:\\'|\\\\|[^'])*'|"(?:\\"|\\\\|[^"])*"/g,
    (match) => {
      const placeholder = `__STRING_${stringCounter++}__`;
      stringPlaceholders[placeholder] = match;
      return placeholder;
    }
  );
  // Handle JSX tags - more comprehensive approach for nested components
  // First, capture JSX opening tags, self-closing tags, and closing tags
  const withoutJSX = withoutStrings.replace(
    /<([A-Z][a-zA-Z0-9]*|[a-z][a-z0-9]*)((?:\s+[a-zA-Z0-9_]+(?:=(?:"|'|\{).*?(?:"|'|\}))?)*)\s*(?:\/)?>/g,
    (match, tagName, attributes) => {
      const placeholder = `__JSX_TAG_${stringCounter++}__`;
      stringPlaceholders[placeholder] = match;
      // Add the tag name as token
      tokens.push(tagName);
      // Extract and add attribute names
      if (attributes) {
        const attrMatches = attributes.match(/[a-zA-Z0-9_]+(?==)/g);
        if (attrMatches) {
          tokens.push(...attrMatches);
        }
      }
      return placeholder;
    }
  );
  // Handle JSX closing tags
  const withoutJSXClosing = withoutJSX.replace(
    /<\/([A-Z][a-zA-Z0-9]*|[a-z][a-z0-9]*)>/g,
    (match, tagName) => {
      const placeholder = `__JSX_CLOSING_${stringCounter++}__`;
      stringPlaceholders[placeholder] = match;
      tokens.push(tagName);
      return placeholder;
    }
  );
  // Handle decorators with placeholder - more comprehensive for complex decorators
  const withoutDecorators = withoutJSXClosing.replace(
    /@([a-zA-Z][a-zA-Z0-9_]*)(?:\((?:[^)(]*|\([^)(]*\))*\))?/g,
    (match, decoratorName) => {
      const placeholder = `__DECORATOR_${stringCounter++}__`;
      stringPlaceholders[placeholder] = match;
      // Add the decorator name as token
      tokens.push(decoratorName);
      // If there are parameters to the decorator, tokenize them separately
      const paramMatch = match.match(/\((.*)\)/);
      if (paramMatch && paramMatch[1]) {
        const paramTokens = tokenizeGeneric(paramMatch[1]);
        tokens.push(...paramTokens);
      }
      return placeholder;
    }
  );
  // Handle arrow functions specially
  const withoutArrows = withoutDecorators.replace(/=>/g, (match) => {
    tokens.push("arrow_function"); // Use a special token for recognizing arrow functions
    return " => "; // Preserve the token but with spaces for other tokenization
  });
  // Handle optional chaining and nullish coalescing
  const withSpecialOps = withoutArrows
    .replace(/\?\./g, (match) => {
      tokens.push("optional_chaining");
return " ?. "; // Space-separated for tokenization }) .replace(/\?\?/g, (match) => { tokens.push("nullish_coalescing"); return " ?? "; // Space-separated for tokenization }); // Handle import statements more robustly const withoutImports = withSpecialOps.replace( /import\s+(?:{[^}]*}|\*\s+as\s+[a-zA-Z][a-zA-Z0-9_]*|[a-zA-Z][a-zA-Z0-9_]*)\s+from\s+['"][^'"]*['"]/g, (match) => { // Add import as a token tokens.push("import"); // Extract module name const moduleMatch = match.match(/from\s+['"]([^'"]*)['"]/); if (moduleMatch && moduleMatch[1]) { tokens.push(moduleMatch[1]); } // Extract imported identifiers const importedMatch = match.match( /import\s+({[^}]*}|\*\s+as\s+[a-zA-Z][a-zA-Z0-9_]*|[a-zA-Z][a-zA-Z0-9_]*)/ ); if (importedMatch && importedMatch[1]) { const importSection = importedMatch[1]; if (importSection.startsWith("{")) { // Named imports const namedImports = importSection .replace(/[{}]/g, "") .split(",") .map((part) => part.trim()) .filter((part) => part.length > 0); tokens.push(...namedImports); } else if (importSection.includes("* as")) { // Namespace import const nsMatch = importSection.match( /\*\s+as\s+([a-zA-Z][a-zA-Z0-9_]*)/ ); if (nsMatch && nsMatch[1]) { tokens.push(nsMatch[1]); } } else { // Default import tokens.push(importSection.trim()); } } return " "; // Replace with a space } ); // Split remaining text into tokens let mainTokens = tokenizeGeneric(withoutImports); // Handle camelCase and PascalCase by splitting them into separate tokens const processedTokens = []; for (const token of mainTokens) { // Skip placeholder tokens (we'll handle them separately) if (token.startsWith("__") && token.endsWith("__")) { processedTokens.push(token); continue; } // Skip operators we've already handled if (["=>", "?.", "??"].includes(token)) { processedTokens.push(token); continue; } // Split camelCase into separate tokens const camelTokens = token .replace(/([a-z])([A-Z])/g, "$1 $2") .toLowerCase() .split(" "); // Add original token and split tokens 
processedTokens.push(token); if (camelTokens.length > 1) { processedTokens.push(...camelTokens); } } // Replace placeholders with their original values const finalTokens = []; for (const token of processedTokens) { if (stringPlaceholders[token]) { // Add the original placeholder as a token (to preserve context) if (token.startsWith("__REGEX_")) { finalTokens.push("regex_literal"); } else if (token.startsWith("__JSX_")) { finalTokens.push("jsx_element"); } else if (token.startsWith("__DECORATOR_")) { finalTokens.push("decorator"); } else { finalTokens.push(token); } // For string literals, also add their content as tokens if (token.startsWith("__STRING_") || token.startsWith("__TEMPLATE_")) { // Extract content and add relevant words const content = stringPlaceholders[token]; // Remove quotes/backticks and tokenize content const strContent = content.replace(/^[`'"](.*)[`'"]$/s, "$1"); const contentTokens = tokenizeGeneric(strContent); finalTokens.push(...contentTokens); } } else if (commentPlaceholders[token]) { // For comments, optionally extract keywords if needed // Don't add the full comment as a token to avoid noise finalTokens.push("code_comment"); // Extract possible important terms from comments const commentContent = commentPlaceholders[token] .replace(/^\/\*|\*\/$/g, "") // Remove /* */ .replace(/^\/\//g, ""); // Remove // // Only use alphanumeric words from comments, skip punctuation and symbols const commentTokens = commentContent .split(/\s+/) .filter((word) => /^[a-z0-9_]{3,}$/i.test(word)) .map((word) => word.toLowerCase()); finalTokens.push(...commentTokens); } else { finalTokens.push(token); } } return [...new Set(finalTokens)]; // Remove duplicates } /** * Python-specific tokenization * Handles snake_case, decorators, f-strings, indentation significance * * @param {string} text - The Python text to tokenize * @returns {string[]} An array of tokens */ function tokenizePython(text) { let tokens = []; // Preserve comments for content analysis but mark 
// them specially
  const commentPlaceholders = {};
  let commentCounter = 0;
  // Remove block comments first (triple-quoted strings when used as comments)
  const withoutDocstrings = text.replace(
    /(?:'''[\s\S]*?'''|"""[\s\S]*?""")/g,
    (match) => {
      const placeholder = `__PYCOMMENT_BLOCK_${commentCounter++}__`;
      commentPlaceholders[placeholder] = match;
      return placeholder;
    }
  );
  // Remove line comments
  const withoutComments = withoutDocstrings.replace(/#[^\n]*/g, (match) => {
    const placeholder = `__PYCOMMENT_LINE_${commentCounter++}__`;
    commentPlaceholders[placeholder] = match;
    return placeholder;
  });
  // Handle string literals
  const stringPlaceholders = {};
  let placeholderCounter = 0;
  // Enhanced f-string handling - look for f, fr, rf prefixes and capture interpolation
  const withoutFStrings = withoutComments.replace(
    /(?:f|fr|rf)(?:'''[\s\S]*?'''|"""[\s\S]*?"""|'(?:\\'|\\\\|[^'])*'|"(?:\\"|\\\\|[^"])*")/g,
    (match) => {
      const placeholder = `__PYFSTRING_${placeholderCounter++}__`;
      stringPlaceholders[placeholder] = match;
      // Extract interpolated expressions from {...} and tokenize them separately
      const expressions = [];
      // Match {expression} but not escaped \{
      let expContent = match.match(/(?<!\\){([^{}]*)}/g);
      if (expContent) {
        expContent.forEach((exp) => {
          expressions.push(exp.slice(1, -1)); // Remove { and }
        });
        // Tokenize each expression content
        expressions.forEach((exp) => {
          const expTokens = tokenizePython(exp); // Recursively tokenize expressions
          tokens.push(...expTokens);
        });
      }
      return placeholder;
    }
  );
  // Handle other string literals (r-strings, normal strings)
  const withoutSpecialStrings = withoutFStrings.replace(
    /(?:r|b|rb|br)?(?:'''[\s\S]*?'''|"""[\s\S]*?"""|'(?:\\'|\\\\|[^'])*'|"(?:\\"|\\\\|[^"])*")/g,
    (match) => {
      const placeholder = `__PYSTRING_${placeholderCounter++}__`;
      stringPlaceholders[placeholder] = match;
      return placeholder;
    }
  );
  // Handle decorators with placeholder - more comprehensive for complex decorators
  const withoutDecorators =
    withoutSpecialStrings.replace(
      /@([a-zA-Z][a-zA-Z0-9_.]*)(?:\((?:[^)(]*|\([^)(]*\))?)?/g.source
        ? /@([a-zA-Z][a-zA-Z0-9_.]*)(?:\((?:[^)(]*|\([^)(]*\))*\))?/g
        : /@([a-zA-Z][a-zA-Z0-9_.]*)(?:\((?:[^)(]*|\([^)(]*\))*\))?/g,
      (match, decoratorName) => {
        const placeholder = `__PYDECORATOR_${placeholderCounter++}__`;
        stringPlaceholders[placeholder] = match;
        // Add decorator name as token
        tokens.push(decoratorName);
        // If the decorator has parameters, extract and tokenize them
        const paramMatch = match.match(/\((.*)\)/);
        if (paramMatch && paramMatch[1]) {
          const paramTokens = tokenizeGeneric(paramMatch[1]);
          tokens.push(...paramTokens);
        }
        return placeholder;
      }
    );
  // Handle Python-specific operators
  const withSpecialOps = withoutDecorators
    // Handle walrus operator :=
    .replace(/:=/g, (match) => {
      tokens.push("walrus_operator");
      return " := "; // Space-separated for tokenization
    })
    // Handle list splices with :
    // NOTE(review): this regex is greedy — it spans from the first '[' to the
    // last ']' within a line, so adjacent indexing expressions may be merged
    // into one "slice_operation"; verify against representative inputs.
    .replace(/\[.*:.*\]/g, (match) => {
      tokens.push("slice_operation");
      // Process what's inside the brackets
      const innerContent = match.slice(1, -1);
      const sliceParts = innerContent.split(":");
      sliceParts.forEach((part) => {
        if (part.trim()) {
          const partTokens = tokenizeGeneric(part.trim());
          tokens.push(...partTokens);
        }
      });
      return match; // Preserve for general tokenization
    });
  // Process lines with indentation awareness
  const lines = withSpecialOps.split("\n");
  // Track indentation levels
  let previousIndentLevel = 0;
  for (const line of lines) {
    // Skip empty lines
    if (line.trim() === "") continue;
    // Count leading spaces/tabs to track indentation
    const indentMatch = line.match(/^(\s*)/);
    const leadingSpaces = indentMatch ?
indentMatch[1].length : 0;
    if (leadingSpaces !== previousIndentLevel) {
      if (leadingSpaces > previousIndentLevel) {
        // Indentation increased - add token for indent
        tokens.push("indent");
      } else {
        // Indentation decreased - add token for dedent
        // Add one dedent token for each level decreased
        // NOTE(review): dedent counting hard-codes 4-space indentation units
        // (the /4 below), while indent emits a single token regardless of
        // depth — confirm this asymmetry is intended.
        const dedentLevels = Math.floor(
          (previousIndentLevel - leadingSpaces) / 4
        );
        for (let i = 0; i < dedentLevels; i++) {
          tokens.push("dedent");
        }
      }
      previousIndentLevel = leadingSpaces;
    }
    // Tokenize the line content by first removing the leading whitespace
    const lineContent = line.trim();
    if (lineContent) {
      // Check for keyword tokens
      const pythonKeywords = [
        "def", "class", "lambda", "return", "yield", "from", "import", "as",
        "with", "try", "except", "finally", "raise", "assert", "if", "elif",
        "else", "while", "for", "in", "continue", "break", "pass", "global",
        "nonlocal", "del", "is", "not", "and", "or", "async", "await",
        "comprehension", "self",
      ];
      // Add line content keywords
      for (const keyword of pythonKeywords) {
        if (lineContent.includes(keyword)) {
          const keywordRegex = new RegExp(`\\b${keyword}\\b`, "g");
          if (keywordRegex.test(lineContent)) {
            tokens.push(keyword);
          }
        }
      }
      // Now tokenize the whole line
      const lineTokens = tokenizeGeneric(lineContent);
      tokens.push(...lineTokens);
    }
  }
  // Add keyword for Python-specific list operations
  if (
    withSpecialOps.includes("append(") ||
    withSpecialOps.includes(".extend(")
  ) {
    tokens.push("list_operation");
  }
  // Add keyword for Python-specific dictionary operations
  if (
    withSpecialOps.includes(".get(") ||
    withSpecialOps.includes(".items()") ||
    withSpecialOps.includes(".keys()") ||
    withSpecialOps.includes(".values()")
  ) {
    tokens.push("dict_operation");
  }
  // Split snake_case identifiers
  const snakeCaseTokens = [];
  for (const token of tokens) {
    // Skip placeholder tokens
    if (token.startsWith("__") && token.endsWith("__")) {
      snakeCaseTokens.push(token);
      continue;
    }
    // Split snake_case
    if (token.includes("_")) {
      const parts =
        token.split("_").filter((part) => part.length > 0);
      snakeCaseTokens.push(token); // Original token
      snakeCaseTokens.push(...parts); // Parts of the token
    } else {
      snakeCaseTokens.push(token);
    }
  }
  // Replace placeholders with their original values and process them
  const finalTokens = [];
  for (const token of snakeCaseTokens) {
    if (stringPlaceholders[token]) {
      if (token.startsWith("__PYFSTRING_")) {
        finalTokens.push("f_string");
      } else if (token.startsWith("__PYSTRING_")) {
        finalTokens.push("string_literal");
      } else if (token.startsWith("__PYDECORATOR_")) {
        finalTokens.push("decorator");
      } else {
        finalTokens.push(token);
      }
      // For string placeholders, also tokenize their content
      if (token.startsWith("__PYSTRING_") || token.startsWith("__PYFSTRING_")) {
        const content = stringPlaceholders[token];
        // Extract the content without prefix and quotes
        let strContent = content;
        // Handle different types of string literals
        if (
          strContent.startsWith("f") ||
          strContent.startsWith("r") ||
          strContent.startsWith("fr") ||
          strContent.startsWith("rf") ||
          strContent.startsWith("b") ||
          strContent.startsWith("rb") ||
          strContent.startsWith("br")
        ) {
          const prefixLength = /^[a-z]+/.exec(strContent)[0].length;
          strContent = strContent.substring(prefixLength);
        }
        // Remove quotes
        strContent = strContent.replace(/^['"]|['"]$/g, "");
        strContent = strContent.replace(/^'''|'''$/g, "");
        strContent = strContent.replace(/^"""|"""$/g, "");
        // Remove f-string interpolation markers
        strContent = strContent.replace(/{[^{}]*}/g, " ");
        // Tokenize content
        const contentTokens = tokenizeGeneric(strContent);
        finalTokens.push(...contentTokens);
      }
    } else if (commentPlaceholders[token]) {
      // Extract useful keywords from comments
      finalTokens.push("code_comment");
      // Extract possible important terms from comments
      const commentContent = commentPlaceholders[token]
        .replace(/^#{1}/, "") // Remove #
        .replace(/^'''|'''$/g, "") // Remove '''
        .replace(/^"""|"""$/g, ""); // Remove """
      // Only use alphanumeric words from
comments, skip punctuation and symbols const commentTokens = commentContent .split(/\s+/) .filter((word) => /^[a-z0-9_]{3,}$/i.test(word)) .map((word) => word.toLowerCase()); finalTokens.push(...commentTokens); } else { finalTokens.push(token); } } return [...new Set(finalTokens)]; // Remove duplicates } /** * Java/C#-like language tokenization * Handles annotations, generics, access modifiers, lambda expressions * * @param {string} text - The Java or C# text to tokenize * @returns {string[]} An array of tokens */ function tokenizeJavaLike(text) { let tokens = []; // Preserve comments for content analysis but mark them specially const commentPlaceholders = {}; let commentCounter = 0; // Remove block comments first const withoutBlockComments = text.replace(/\/\*[\s\S]*?\*\//g, (match) => { const placeholder = `__JAVA_COMMENT_BLOCK_${commentCounter++}__`; commentPlaceholders[placeholder] = match; return placeholder; }); // Remove line comments const withoutComments = withoutBlockComments.replace( /\/\/[^\n]*/g, (match) => { const placeholder = `__JAVA_COMMENT_LINE_${commentCounter++}__`; commentPlaceholders[placeholder] = match; return placeholder; } ); // Handle string literals with placeholders const stringPlaceholders = {}; let placeholderCounter = 0; // Handle string literals (support escaping) const withoutStrings = withoutComments.replace( /"(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*'/g, (match) => { const placeholder = `__JAVASTRING_${placeholderCounter++}__`; stringPlaceholders[placeholder] = match; return placeholder; } ); // Handle annotations with parameters more comprehensively const withoutAnnotations = withoutStrings.replace( /@([a-zA-Z][a-zA-Z0-9_.]*)(?:\s*\((?:[^)(]*|\([^)(]*\))*\))?/g, (match, annotationName) => { const placeholder = `__ANNOTATION_${placeholderCounter++}__`; stringPlaceholders[placeholder] = match; // Add annotation name as a token tokens.push("annotation"); tokens.push(annotationName.toLowerCase()); // Extract and process annotation 
parameters const paramMatch = match.match(/\((.*)\)/); if (paramMatch && paramMatch[1]) { const params = paramMatch[1]; // Handle key-value pairs in annotations const keyValuePairs = params.split(","); for (const pair of keyValuePairs) { const parts = pair.split("="); if (parts.length === 2) { // Add parameter name as token tokens.push(parts[0].trim()); } // Tokenize the values const valueTokens = tokenizeGeneric(pair); tokens.push(...valueTokens); } } return placeholder; } ); // Handle generics with better nesting support // This pattern can handle nested generics like Map<String, List<Integer>> const withoutGenerics = withoutAnnotations.replace( /<([^<>]*(?:<[^<>]*(?:<[^<>]*>)*[^<>]*>)*[^<>]*)>/g, (match) => { const placeholder = `__GENERIC_${placeholderCounter++}__`; stringPlaceholders[placeholder] = match; // Add token for generic type usage tokens.push("generic_type"); // Process the content within the generic // Remove the < and > delimiters const content = match.slice(1, -1); // Split by commas to get individual type parameters const typeParams = content.split(/,(?![^<>]*>)/); // Split by commas not within angle brackets // Process each type parameter for (const param of typeParams) { const paramTokens = tokenizeGeneric(param.trim()); tokens.push(...paramTokens); } return placeholder; } ); // Handle lambda expressions (Java: -> and C#: =>) const withoutLambdas = withoutGenerics.replace( /(?:\(.*?\)|[a-zA-Z_][a-zA-Z0-9_]*)\s*(?:->|=>)\s*(?:{[\s\S]*?}|[^;]*)/g, (match) => { const placeholder = `__LAMBDA_${placeholderCounter++}__`; stringPlaceholders[placeholder] = match; // Add token for lambda expression tokens.push("lambda_expression"); // Tokenize the entire lambda expression to extract parameter and body tokens const lambdaTokens = tokenizeGeneric(match); tokens.push(...lambdaTokens); return placeholder; } ); // Extract and add access modifiers as specific tokens const accessModifiers = [ "public", "private", "protected", "internal", "static", "final", 
"abstract", "override", "virtual", "readonly", "const", "sealed", "partial", "async", "volatile", "transient", "synchronized", "unsafe", "extern", ]; let withAccessModifiers = withoutLambdas; for (const modifier of accessModifiers) { // Use word boundaries to match whole words const regex = new RegExp(`\\b${modifier}\\b`, "gi"); withAccessModifiers = withAccessModifiers.replace(regex, (match) => { tokens.push(match.toLowerCase()); tokens.push("access_modifier"); return match; }); } // Handle package/namespace declarations withAccessModifiers = withAccessModifiers.replace( /\b(?:package|namespace)\s+([a-zA-Z_][a-zA-Z0-9_.]*)/g, (match, packageName) => { tokens.push("package_declaration"); // Add the package name and its components const packageParts = packageName.split("."); tokens.push(packageName); tokens.push(...packageParts); return match; } ); // Handle import/using statements withAccessModifiers = withAccessModifiers.replace( /\b(?:import|using)\s+(?:static\s+)?([a-zA-Z_][a-zA-Z0-9_.]*(?:\.\*)?)/g, (match, importName) => { tokens.push("import_statement"); // Add the import name and its components const importParts = importName.split("."); tokens.push(importName); // Remove wildcard * from last part if present if ( importParts.length > 0 && importParts[importParts.length - 1] === "*" ) { importParts.pop(); tokens.push("wildcard_import"); } tokens.push(...importParts); return match; } ); // Handle common C# LINQ expressions if (/\bfrom\b.*\bin\b.*\bselect\b/i.test(withAccessModifiers)) { tokens.push("linq_expression"); // Extract common LINQ keywords const linqKeywords = [ "from", "in", "select", "where", "group", "by", "into", "orderby", "join", "let", "on", "equals", ]; for (const keyword of linqKeywords) { const regex = new RegExp(`\\b${keyword}\\b`, "gi"); if (regex.test(withAccessModifiers)) { tokens.push(`linq_${keyword}`); } } } // Add remaining tokens const mainTokens = tokenizeGeneric(withAccessModifiers); tokens.push(...mainTokens); // Handle camelCase 
and PascalCase with more specialized type names const processedTokens = []; for (const token of tokens) { // Skip placeholder tokens if (token.startsWith("__") && token.endsWith("__")) { processedTokens.push(token); continue; } // Check if token might be a fully qualified name (contains dots) if (token.includes(".")) { const parts = token.split("."); processedTokens.push(token); // Add the full token processedTokens.push(...parts); // Add individual parts continue; } // Split PascalCase and camelCase // Add original token first processedTokens.push(token); // Then add split tokens if there's a case change if (/[a-z][A-Z]/.test(token)) { const parts = token .replace(/([a-z])([A-Z])/g, "$1 $2") .toLowerCase() .split(" "); if (parts.length > 1) { processedTokens.push(...parts); } } } // Replace placeholders with their original values const finalTokens = []; for (const token of processedTokens) { if (stringPlaceholders[token]) { // Add information about what kind of structure this is if (token.startsWith("__JAVASTRING_")) { finalTokens.push("string_literal"); } else if (token.startsWith("__ANNOTATION_")) { finalTokens.push("annotation"); } else if (token.startsWith("__GENERIC_")) { finalTokens.push("generic"); } else if (token.startsWith("__LAMBDA_")) { finalTokens.push("lambda"); } else { finalTokens.push(token); } // For string literals, add their content as tokens if (token.startsWith("__JAVASTRING_")) { const content = stringPlaceholders[token]; // Extract the content without the quotes const strContent = content.replace(/^"|"$/g, "").replace(/^'|'$/g, ""); // Only tokenize non-empty content if (strContent.trim().length > 0) { const contentTokens = tokenizeGeneric(strContent); finalTokens.push(...contentTokens); } } } else if (commentPlaceholders[token]) { // Extract useful keywords from comments finalTokens.push("code_comment"); // Extract possible important terms from comments const commentContent = commentPlaceholders[token] .replac