UNPKG

code-chopper

Version:

A library for semantically dividing code written in various programming languages

589 lines (579 loc) 18.2 kB
"use strict";
// ---------------------------------------------------------------------------
// Bundler runtime helpers (CommonJS <-> ESM interop). Generated code; kept as-is.
// ---------------------------------------------------------------------------
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
  mod
));
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);

// src/index.ts
var index_exports = {};
__export(index_exports, {
  createParserFactory: () => createParserFactory,
  parseCodeAndChunk: () => parseCodeAndChunk,
  readDirectoryAndChunk: () => readDirectoryAndChunk,
  readFileAndChunk: () => readFileAndChunk
});
module.exports = __toCommonJS(index_exports);

// src/chunking/parser-factory.ts
var import_tree_sitter = __toESM(require("tree-sitter"), 1);

// src/chunking/file-extensions.ts
/** Language identifiers accepted by the parser factory. */
var SUPPORTED_LANGUAGES = [
  "javascript",
  "typescript",
  "python",
  "go",
  "rust",
  "java",
  "ruby",
  "c",
  "cpp",
  "html",
  "css",
  "bash"
];
/** Maps a lower-case file extension (including the leading dot) to a language identifier. */
var LANGUAGE_PARSERS = /* @__PURE__ */ new Map([
  // JavaScript/TypeScript ecosystem
  [".js", "javascript"],
  [".jsx", "javascript"],
  [".ts", "typescript"],
  [".tsx", "typescript"],
  [".mjs", "javascript"],
  [".mts", "typescript"],
  [".cjs", "javascript"],
  // Python
  [".py", "python"],
  // Go
  [".go", "go"],
  // Rust
  [".rs", "rust"],
  // Java
  [".java", "java"],
  // Ruby
  [".rb", "ruby"],
  // C/C++
  [".c", "c"],
  [".cpp", "cpp"],
  [".h", "c"],
  // Web
  [".html", "html"],
  [".css", "css"],
  [".scss", "css"],
  [".sass", "css"],
  // Shell
  [".sh", "bash"],
  [".bash", "bash"]
]);
// Note: despite its name this set holds the file extensions (the map keys), not language ids.
var TREE_SITTER_SUPPORTED = new Set(
  Array.from(LANGUAGE_PARSERS.keys())
);
/**
 * @param {string} language
 * @returns {boolean} true when `language` is listed in SUPPORTED_LANGUAGES.
 */
function isSupportedLanguage(language) {
  return SUPPORTED_LANGUAGES.includes(language);
}
/**
 * @param {string} ext File extension including the leading dot; matched case-insensitively.
 * @returns {string | undefined} The mapped language id, or undefined when the extension is not mapped.
 */
function getLanguageFromExtension(ext) {
  return LANGUAGE_PARSERS.get(ext.toLowerCase());
}

// src/chunking/parser-factory.ts
var import_node_process = require("process");
/**
 * Builds an async loader that dynamically imports the tree-sitter grammar for a language id.
 * The returned loader resolves to the grammar module, or null when the language id is
 * unknown or the import fails.
 */
var createLanguageLoader = () => async (language) => {
  try {
    switch (language) {
      // NOTE(review): "javascript" and "typescript" both resolve to the `typescript` grammar of
      // tree-sitter-typescript; since .jsx/.tsx map to these ids as well, JSX is parsed with
      // that grammar too — confirm this is intended.
      case "javascript":
      case "typescript": {
        const ts = await import("tree-sitter-typescript");
        return ts.typescript ?? ts.default?.typescript ?? ts.default;
      }
      case "python":
        return (await import("tree-sitter-python")).default;
      case "go":
        return (await import("tree-sitter-go")).default;
      case "rust":
        return (await import("tree-sitter-rust")).default;
      case "java":
        return (await import("tree-sitter-java")).default;
      case "ruby":
        return (await import("tree-sitter-ruby")).default;
      case "c":
        return (await import("tree-sitter-c")).default;
      case "cpp":
        return (await import("tree-sitter-cpp")).default;
      case "html":
        return (await import("tree-sitter-html")).default;
      case "css":
        return (await import("tree-sitter-css")).default;
      case "bash":
        return (await import("tree-sitter-bash")).default;
      default:
        return null;
    }
  } catch (error) {
    // Keep the failure reason visible instead of discarding it.
    console.log("Failed to load language module for ", language, error);
    return null;
  }
};
/**
 * Creates a factory that lazily builds one tree-sitter parser per language and caches it.
 *
 * @returns {{ createParser: (language: string) => Promise<object | null>, dispose: () => void }}
 *   `createParser` resolves to the cached parser, or null when the language is unsupported,
 *   its grammar cannot be loaded, or the grammar is rejected by the parser.
 *   `dispose` empties the cache.
 */
var createParserFactory = () => {
  const parsers = /* @__PURE__ */ new Map();
  const loader = createLanguageLoader();
  const createParser = async (language) => {
    if (!isSupportedLanguage(language)) {
      console.warn(`${language} is not supported.`);
      return null;
    }
    if (!parsers.has(language) && loader) {
      const languageModule = await loader(language);
      if (languageModule) {
        const parser = new import_tree_sitter.default();
        try {
          parser.setLanguage(languageModule);
          parsers.set(language, parser);
        } catch (e) {
          // A library must not terminate the host process (the previous code called
          // process.exit() here, which also reported success). Report the problem and
          // behave like any other "no parser available" case.
          console.error(`Failed to set language "${language}" (module: ${languageModule.name}):`, e);
          return null;
        }
      }
    }
    return parsers.get(language) || null;
  };
  const dispose = () => {
    parsers.clear();
  };
  return {
    createParser,
    dispose
  };
};

// src/io/file-operations.ts
var import_promises = __toESM(require("fs/promises"), 1);
var import_node_path2 = __toESM(require("path"), 1);

// src/chunking/cst-operations.ts
var import_node_path = __toESM(require("path"), 1);

// src/chunking/language-node-types.ts
/** Per language: CST node types that are treated as chunk boundaries, grouped by category. */
var LANGUAGE_NODE_TYPES = {
  javascript: {
    functions: [
      "function_declaration",
      "function_expression"
      // "arrow_function",
    ],
    classes: ["class_declaration"],
    methods: ["method_definition"],
    imports: ["import_statement"],
    variables: ["variable_declaration", "lexical_declaration"]
  },
  typescript: {
    functions: [
      "function_declaration",
      "function_expression"
      // "arrow_function",
    ],
    classes: ["class_declaration"],
    methods: ["method_definition"],
    interfaces: ["interface_declaration"],
    types: ["type_alias_declaration"],
    imports: ["import_statement"],
    variables: ["variable_declaration", "lexical_declaration", "public_field_definition"]
  },
  python: {
    functions: ["function_definition"],
    classes: ["class_definition"],
    methods: ["function_definition"],
    // Methods within classes are also function_definition
    imports: ["import_statement", "import_from_statement"],
    variables: ["assignment"]
    // Variable assignment in Python
  },
  go: {
    functions: ["function_declaration"],
    methods: ["method_declaration"],
    types: ["type_declaration"],
    imports: ["import_declaration"],
    variables: [
      "var_spec",
      "const_spec",
      "short_var_declaration"
    ]
  },
  rust: {
    functions: ["function_item"],
    structs: ["struct_item"],
    impls: ["impl_item"],
    traits: ["trait_item"],
    imports: ["use_declaration"],
    variables: ["let_declaration"]
  },
  java: {
    functions: ["method_declaration"],
    classes: ["class_declaration"],
    interfaces: ["interface_declaration"],
    imports: ["import_declaration"],
    variables: ["local_variable_declaration"]
  },
  ruby: {
    functions: ["method"],
    classes: ["class"],
    modules: ["module"],
    imports: ["require", "load"],
    variables: ["assignment"]
  },
  c: {
    functions: ["function_definition"],
    structs: ["struct_specifier"],
    enums: ["enum_specifier"],
    typedefs: ["type_definition"],
    includes: ["preproc_include"],
    variables: ["declaration"]
  },
  cpp: {
    functions: ["function_definition"],
    classes: ["class_specifier"],
    structs: ["struct_specifier"],
    namespaces: ["namespace_definition"],
    templates: ["template_declaration"],
    includes: ["preproc_include"],
    variables: ["declaration"]
  },
  html: {
    elements: ["element"],
    scripts: ["script_element"],
    styles: ["style_element"]
  },
  css: {
    rules: ["rule_set"],
    media: ["media_statement"],
    keyframes: ["keyframes_statement"],
    imports: ["import_statement"]
  },
  bash: {
    functions: ["function_definition"],
    commands: ["command"],
    variables: ["variable_assignment"]
  }
};
/**
 * Collects every boundary node type configured for `language` into a Set.
 * Languages without an entry in LANGUAGE_NODE_TYPES use the TypeScript configuration.
 *
 * @param {string} language
 * @returns {Set<string>}
 */
var createBoundaryNodeTypes = (language) => {
  const langConfig = LANGUAGE_NODE_TYPES[language] ?? LANGUAGE_NODE_TYPES.typescript;
  return new Set(Object.values(langConfig).flat());
};
/**
 * Creates a function that derives a display name from a CST node.
 *
 * Lookup order: the node's `name` field, then language-specific rules, then the first
 * direct child of type `identifier`. The extractor returns undefined when nothing matches.
 *
 * @param {string} language
 * @returns {(node: object) => string | undefined}
 */
var createNodeNameExtractor = (language) => {
  return (node) => {
    // Generic lookup. This already covers every node whose grammar exposes a `name` field,
    // so the language-specific rules below only handle nodes where it yields nothing.
    const nameField = node.childForFieldName?.("name");
    if (nameField?.text) {
      return nameField.text;
    }
    switch (language) {
      case "javascript":
      case "typescript": {
        if (node.type === "arrow_function") {
          // An arrow function has no name of its own; use the name field of its parent.
          const idNode = node.parent?.childForFieldName("name");
          if (idNode?.text) {
            return idNode.text;
          }
        }
        if (node.type === "variable_declaration" || node.type === "lexical_declaration") {
          // Use the name of the first declarator only.
          const declarator = node.children.find((c) => c.type === "variable_declarator");
          const idNode = declarator?.childForFieldName("name");
          if (idNode?.text) {
            return idNode.text;
          }
        }
        if (node.type === "method_definition") {
          const keyNode = node.childForFieldName("key");
          if (keyNode?.text) {
            return keyNode.text;
          }
        }
        break;
      }
      case "go": {
        // Walk down the first-child chain and return the first `identifier` found among
        // the children of any node on that chain.
        let target = node;
        while (target.children.length > 0) {
          const identifier = target.children.find((c) => c.type === "identifier");
          if (identifier) {
            return identifier.text;
          }
          if (!target.firstChild) {
            break;
          }
          target = target.firstChild;
        }
        break;
      }
      default:
        break;
    }
    const identifierChild = node.children?.find?.(
      (child) => child.type === "identifier"
    );
    return identifierChild?.text;
  };
};
/**
 * Creates a function that looks up the documentation attached to a CST node.
 *
 * The extractor returns `{ hasDocs: true, detail: { text, startIndex, endIndex } }` when
 * documentation is found and `{ hasDocs: false }` otherwise.
 *
 * @param {string} language
 * @returns {(node: object) => { hasDocs: boolean, detail?: { text: string, startIndex: number, endIndex: number } }}
 */
var createDocsExtracor = (language) => {
  const noDocs = () => ({ hasDocs: false });
  const toDocs = (candidate) => ({
    hasDocs: true,
    detail: {
      text: candidate.text,
      startIndex: candidate.startIndex,
      endIndex: candidate.endIndex
    }
  });
  // Documentation written as a comment node directly preceding the declaration.
  const extractOuterDocComment = (node) => {
    let doc_candidate = node.previousSibling;
    const isJsLike = language === "javascript" || language === "typescript";
    if (isJsLike && node.parent?.type === "export_statement") {
      // For `export <declaration>` the comment precedes the export statement instead.
      doc_candidate = node.parent.previousSibling;
    }
    if (doc_candidate && doc_candidate.type.includes("comment")) {
      return toDocs(doc_candidate);
    }
    return noDocs();
  };
  // Python docstring: a string node found at lastChild -> firstChild -> firstChild.
  const extractPyDocComment = (node) => {
    const doc_candidate = node.lastChild?.firstChild?.firstChild;
    if (doc_candidate && doc_candidate.type === "string") {
      return toDocs(doc_candidate);
    }
    return noDocs();
  };
  return (node) => {
    switch (language) {
      case "javascript":
      case "typescript":
      case "rust":
      case "java":
      case "ruby":
      case "c":
      case "cpp":
      case "go":
        return extractOuterDocComment(node);
      case "python":
        return extractPyDocComment(node);
      default:
        // html, css, bash and any unknown language: no documentation lookup.
        return noDocs();
    }
  };
};

// src/chunking/cst-operations.ts
/**
 * Creates a depth-first traverser that collects boundary nodes of a CST.
 *
 * `traverse(node, filter)` visits `node` and all of its descendants. A node is recorded when
 * its type is a boundary type for `language` and `filter(language, node)` is truthy. Each
 * record's `parentInfo` lists the names of the enclosing recorded boundaries (outermost
 * first), and its `startIndex` is moved back to the start of the documentation when the
 * node has docs.
 *
 * @param {string} language
 */
var createNodeTraverser = (language) => {
  const boundaryNodeTypes = createBoundaryNodeTypes(language);
  const extractName = createNodeNameExtractor(language);
  const extractDocs = createDocsExtracor(language);
  const isBoundary = (nodeType) => boundaryNodeTypes.has(nodeType);
  const traverse = (node, filter) => {
    const boundaries = [];
    const visit = (node2, parentInfo) => {
      let childParentInfo = parentInfo;
      if (isBoundary(node2.type) && filter(language, node2)) {
        // Docs and name are only needed for accepted boundaries, so compute them here
        // rather than for every node of the tree.
        const docs = extractDocs(node2);
        const name = extractName(node2);
        boundaries.push({
          type: node2.type,
          parentInfo,
          name,
          startIndex: docs.hasDocs ? docs.detail.startIndex : node2.startIndex,
          endIndex: node2.endIndex,
          text: node2.text,
          docsText: docs.hasDocs ? docs.detail.text : ""
        });
        if (name) {
          childParentInfo = [...parentInfo, name];
        }
      }
      for (const child of node2.children) {
        visit(child, childParentInfo);
      }
    };
    visit(node, []);
    return boundaries;
  };
  return { traverse };
};
/**
 * Bundles the two CST steps used for chunking: parsing source into boundaries and
 * converting boundaries into chunk objects.
 *
 * @param {{ createParser: (language: string) => Promise<object | null> }} factory
 */
var createCSTOperations = (factory) => {
  /**
   * @throws {Error} when the factory yields no parser for `language`.
   */
  const parseAndExtractBoundaries = async (code, language, options) => {
    const parser = await factory.createParser(language);
    if (!parser) {
      throw new Error(`No parser available for language: ${language}`);
    }
    const traverser = createNodeTraverser(language);
    const tree = parser.parse(code);
    // `options` may be omitted by callers; without a filter every boundary is accepted.
    return traverser.traverse(tree.rootNode, options?.filter ?? (() => true));
  };
  const boundariesToChunks = (boundaries) => {
    return boundaries.map((boundary) => ({
      content: boundary.text,
      startOffset: boundary.startIndex,
      endOffset: boundary.endIndex,
      // Because the actual value is inserted using the I/O functions defined in io/file-operations.ts, the result is an empty string.
      filePath: "",
      boundary: {
        type: boundary.type,
        name: boundary.name,
        parent: boundary.parentInfo,
        docs: boundary.docsText
      }
    }));
  };
  return {
    parseAndExtractBoundaries,
    boundariesToChunks
  };
};
/** Runs `operation` with CST operations bound to `factory` and resolves to its result. */
var withCSTParsing = async (factory, operation) => {
  const ops = createCSTOperations(factory);
  return await operation(ops);
};
/**
 * Creates the chunking entry points:
 * - `chunkWithCST(code, language, options, factory)` parses `code` and returns its chunks.
 * - `chunkWithFallback(code, filePath, options, fallback)` derives the language from the file
 *   extension, chunks with a temporary parser factory, and delegates to `fallback` when the
 *   extension is unknown or CST chunking throws.
 */
var createCSTChunkingOperations = () => {
  const chunkWithCST = async (code, language, _options, factory) => {
    return withCSTParsing(factory, async (ops) => {
      const boundaries = await ops.parseAndExtractBoundaries(code, language, _options);
      return ops.boundariesToChunks(boundaries);
    });
  };
  const chunkWithFallback = async (code, filePath, options, fallback) => {
    const ext = import_node_path.default.extname(filePath);
    const language = getLanguageFromExtension(ext);
    if (!language) {
      return fallback(code, "unknown", options);
    }
    const factory = createParserFactory();
    try {
      return await chunkWithCST(code, language, options, factory);
    } catch {
      // Best effort by design: any CST failure is handed to the caller-supplied fallback.
      return fallback(code, language, options);
    } finally {
      factory.dispose();
    }
  };
  return {
    chunkWithCST,
    chunkWithFallback
  };
};

// src/io/file-operations.ts
/**
 * True when the file name's extension is mapped to a language in LANGUAGE_PARSERS.
 * Derived from the same table as getLanguageFromExtension so the two cannot drift apart
 * (the previous unanchored regex matched names such as ".json" or ".config" and missed
 * mapped extensions such as ".rs" and ".sh").
 *
 * @param {string} filename
 * @returns {boolean}
 */
var isSupportedLanguageExtension = (filename) => getLanguageFromExtension(import_node_path2.default.extname(filename)) !== void 0;
/**
 * Reads `relativeFilePath` (resolved against `baseDirPath`) as UTF-8 and chunks it.
 *
 * @returns {Promise<object[]>} Chunks with `filePath` set to `relativeFilePath`; an empty
 *   array when the extension is not mapped to a language.
 */
var readFileAndChunk = async (factory, options, baseDirPath, relativeFilePath) => {
  const code = await import_promises.default.readFile(import_node_path2.default.join(baseDirPath, relativeFilePath), "utf8");
  const ext = import_node_path2.default.extname(relativeFilePath);
  const language = getLanguageFromExtension(ext);
  if (!language) {
    return [];
  }
  const chunks = await parseCodeAndChunk(code, language, factory, options);
  return chunks.map((c) => ({ ...c, filePath: relativeFilePath }));
};
/**
 * Recursively chunks every supported file below `baseDirPath`.
 * Directories whose name matches one of `options.excludeDirs` (default: node_modules and
 * dot-directories) are skipped.
 */
var readDirectoryAndChunk = async (factory, options, baseDirPath) => _readDirectoryAndChunkRecursive(factory, options, baseDirPath, "");
var _readDirectoryAndChunkRecursive = async (factory, options, baseDirPath, relativePath = "") => {
  const currentPath = import_node_path2.default.join(baseDirPath, relativePath);
  const entries = await import_promises.default.readdir(currentPath, { withFileTypes: true });
  const excludeDirs = options?.excludeDirs ?? [/node_modules/, /^\.[A-Za-z0-9_-]+$/];
  const promises = entries.map(async (entry) => {
    const newRelativePath = import_node_path2.default.join(relativePath, entry.name);
    if (entry.isDirectory()) {
      // A directory is either recursed into or skipped; it must never reach the file branch
      // (reading an excluded directory as a file used to reject the whole traversal).
      if (excludeDirs.every((e) => !e.test(entry.name))) {
        return _readDirectoryAndChunkRecursive(factory, options, baseDirPath, newRelativePath);
      }
      return [];
    }
    if (!isSupportedLanguageExtension(entry.name)) {
      return [];
    }
    try {
      // Await inside the try block so that an asynchronous failure is actually caught.
      return await readFileAndChunk(factory, options, baseDirPath, newRelativePath);
    } catch (error) {
      // One unreadable or unparsable file must not abort the whole directory scan.
      console.warn(`Skipping ${newRelativePath}: ${error?.message ?? error}`);
      return [];
    }
  });
  const nestedResults = await Promise.all(promises);
  return nestedResults.flat();
};
/**
 * Chunks `code` written in `language` using a parser obtained from `factory`.
 *
 * @returns {Promise<object[]>} Chunks with an empty `filePath`. Rejects when no parser is
 *   available for `language`.
 */
var parseCodeAndChunk = (code, language, factory, options) => {
  const cstOperations = createCSTChunkingOperations();
  return cstOperations.chunkWithCST(code, language, options, factory);
};
// Annotate the CommonJS export names for ESM import in node:
0 && (module.exports = {
  createParserFactory,
  parseCodeAndChunk,
  readDirectoryAndChunk,
  readFileAndChunk
});
/*!
 * Copyright (c) ushirononeko 2025
 * Copyright (c) sirasagi62 2025
 * Published under MIT License
 * see https://opensource.org/licenses/MIT
 *
 * This code was originally created by ushirononeko and modified by sirasagi62
 * Original: https://github.com/ushironoko/gistdex
 */