code-chopper
Version:
A library for semantically dividing code written in various programming languages
589 lines (579 loc) • 18.2 kB
JavaScript
// Short aliases for the Object reflection built-ins used by the module
// interop helpers (__export, __copyProps, __toESM, __toCommonJS).
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
/**
 * Installs every entry of `all` on `target` as an enumerable getter.
 *
 * @param {object} target - Object that receives the getters.
 * @param {object} all - Map of export name to a zero-argument function that
 *   produces the exported value when the property is read.
 */
var __export = (target, all) => {
  for (const exportName in all) {
    __defProp(target, exportName, {
      get: all[exportName],
      enumerable: true
    });
  }
};
/**
 * Mirrors the own properties of `from` onto `to` as live getters.
 *
 * Keys that `to` already owns, and the key equal to `except`, are skipped.
 * Each copied property keeps the enumerability of the source property.
 *
 * @param {object} to - Destination object; returned to the caller.
 * @param {*} from - Source; ignored unless it is a non-null object or a function.
 * @param {string} [except] - Single key that must not be copied.
 * @param {*} [desc] - Scratch slot for the current property descriptor.
 * @returns {object} `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};
/**
 * Wraps a required module in an ESM-shaped namespace object.
 *
 * The wrapper inherits from the module's prototype and exposes the module's
 * own properties through live getters.
 *
 * @param {*} mod - Value returned by `require`.
 * @param {*} [isNodeMode] - Truthy when the importer is in node compatibility mode.
 * @param {*} [target] - Scratch slot for the namespace object being built.
 * @returns {object} The namespace object.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  const namespace = needsDefault
    ? __defProp(target, "default", { value: mod, enumerable: true })
    : target;
  return __copyProps(namespace, mod);
};
/**
 * Builds the CommonJS export object for an ESM-style exports record.
 *
 * @param {object} mod - Exports record whose properties are copied as getters.
 * @returns {object} A new object flagged with `__esModule: true`.
 */
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
// src/index.ts
// Export record for src/index.ts. Each public name is registered as a getter,
// so it can reference bindings that are assigned further down in this file.
var index_exports = {};
__export(index_exports, {
createParserFactory: () => createParserFactory,
parseCodeAndChunk: () => parseCodeAndChunk,
readDirectoryAndChunk: () => readDirectoryAndChunk,
readFileAndChunk: () => readFileAndChunk
});
// Publish the record as this module's CommonJS exports (flagged __esModule).
module.exports = __toCommonJS(index_exports);
// src/chunking/parser-factory.ts
var import_tree_sitter = __toESM(require("tree-sitter"), 1);
// src/chunking/file-extensions.ts
// Language ids accepted by isSupportedLanguage. Every id listed here also has
// a case in createLanguageLoader and an entry in LANGUAGE_NODE_TYPES.
var SUPPORTED_LANGUAGES = [
"javascript",
"typescript",
"python",
"go",
"rust",
"java",
"ruby",
"c",
"cpp",
"html",
"css",
"bash"
];
// Maps a lowercase file extension (leading dot included) to a language id from
// SUPPORTED_LANGUAGES. getLanguageFromExtension performs the lookup.
var LANGUAGE_PARSERS = /* @__PURE__ */ new Map([
// JavaScript/TypeScript ecosystem
[".js", "javascript"],
[".jsx", "javascript"],
[".ts", "typescript"],
[".tsx", "typescript"],
[".mjs", "javascript"],
[".mts", "typescript"],
[".cjs", "javascript"],
// Python
[".py", "python"],
// Go
[".go", "go"],
// Rust
[".rs", "rust"],
// Java
[".java", "java"],
// Ruby
[".rb", "ruby"],
// C/C++
[".c", "c"],
[".cpp", "cpp"],
// NOTE(review): headers are always treated as C, including C++ headers.
[".h", "c"],
// Web
[".html", "html"],
[".css", "css"],
[".scss", "css"],
[".sass", "css"],
// Shell
[".sh", "bash"],
[".bash", "bash"]
]);
// Set of the file extensions that are keys of LANGUAGE_PARSERS. Despite the
// name, the members are extensions (".js", ".py", ...), not language ids.
// NOTE(review): not referenced by any other code visible in this bundle.
var TREE_SITTER_SUPPORTED = new Set(
Array.from(LANGUAGE_PARSERS.keys())
);
/**
 * Reports whether `language` is one of the ids in SUPPORTED_LANGUAGES.
 *
 * @param {string} language - Language id to check.
 * @returns {boolean} True when the id is listed.
 */
function isSupportedLanguage(language) {
  const position = SUPPORTED_LANGUAGES.indexOf(language);
  return position !== -1;
}
/**
 * Looks up the language id registered for a file extension.
 *
 * @param {string} ext - Extension including the leading dot; case-insensitive.
 * @returns {string|undefined} The language id, or undefined when the extension
 *   has no entry in LANGUAGE_PARSERS.
 */
function getLanguageFromExtension(ext) {
  const normalizedExt = ext.toLowerCase();
  return LANGUAGE_PARSERS.get(normalizedExt);
}
// src/chunking/parser-factory.ts
var import_node_process = require("process");
/**
 * Creates the async loader that resolves a language id to its tree-sitter
 * grammar module.
 *
 * Grammars are imported lazily. Both "javascript" and "typescript" resolve to
 * the `typescript` grammar exported by the tree-sitter-typescript package.
 *
 * @returns {(language: string) => Promise<*>} Loader that resolves to the
 *   grammar, or to null when the id is unknown or the import fails.
 */
var createLanguageLoader = () => async (language) => {
  try {
    switch (language) {
      case "javascript":
      case "typescript": {
        // Braces keep `ts` scoped to this case instead of the whole switch.
        const ts = await import("tree-sitter-typescript");
        // The grammar may sit on the namespace or on the default export.
        return ts.typescript ?? ts.default?.typescript ?? ts.default;
      }
      case "python":
        return (await import("tree-sitter-python")).default;
      case "go":
        return (await import("tree-sitter-go")).default;
      case "rust":
        return (await import("tree-sitter-rust")).default;
      case "java":
        return (await import("tree-sitter-java")).default;
      case "ruby":
        return (await import("tree-sitter-ruby")).default;
      case "c":
        return (await import("tree-sitter-c")).default;
      case "cpp":
        return (await import("tree-sitter-cpp")).default;
      case "html":
        return (await import("tree-sitter-html")).default;
      case "css":
        return (await import("tree-sitter-css")).default;
      case "bash":
        return (await import("tree-sitter-bash")).default;
      default:
        return null;
    }
  } catch (error) {
    // Loading stays best-effort, but the cause is logged so a missing or
    // broken grammar package can be diagnosed.
    console.log("Failed to load language module for ", language, error);
    return null;
  }
};
/**
 * Creates a factory that builds tree-sitter parsers and caches one parser per
 * language id.
 *
 * @returns {{createParser: (language: string) => Promise<*>, dispose: () => void}}
 *   `createParser` resolves to the cached or newly built parser, or to null
 *   when the language is unsupported, its grammar cannot be loaded, or the
 *   grammar is rejected by the parser. `dispose` empties the cache.
 */
var createParserFactory = () => {
  const parsers = /* @__PURE__ */ new Map();
  const loader = createLanguageLoader();
  const createParser = async (language) => {
    if (!isSupportedLanguage(language)) {
      console.warn(`${language} is not supported.`);
      return null;
    }
    const cached = parsers.get(language);
    if (cached) {
      return cached;
    }
    const languageModule = await loader(language);
    if (!languageModule) {
      return null;
    }
    const parser = new import_tree_sitter.default();
    try {
      parser.setLanguage(languageModule);
    } catch (error) {
      // A library must not terminate the host process. Report the failure and
      // return null, the same result as every other failure path here.
      console.warn(`Failed to initialize the tree-sitter parser for ${language}:`, error);
      return null;
    }
    parsers.set(language, parser);
    return parser;
  };
  const dispose = () => {
    parsers.clear();
  };
  return { createParser, dispose };
};
// src/io/file-operations.ts
var import_promises = __toESM(require("fs/promises"), 1);
var import_node_path2 = __toESM(require("path"), 1);
// src/chunking/cst-operations.ts
var import_node_path = __toESM(require("path"), 1);
// src/chunking/language-node-types.ts
// Per-language table of syntax-node type names, grouped by role.
// createBoundaryNodeTypes flattens all groups of one language into a single
// set; the group names are only used to organise this table.
var LANGUAGE_NODE_TYPES = {
javascript: {
functions: [
"function_declaration",
"function_expression"
// "arrow_function",
],
classes: ["class_declaration"],
methods: ["method_definition"],
imports: ["import_statement"],
variables: ["variable_declaration", "lexical_declaration"]
},
typescript: {
functions: [
"function_declaration",
"function_expression"
// "arrow_function",
],
classes: ["class_declaration"],
methods: ["method_definition"],
interfaces: ["interface_declaration"],
types: ["type_alias_declaration"],
imports: ["import_statement"],
variables: ["variable_declaration", "lexical_declaration", "public_field_definition"]
},
python: {
functions: ["function_definition"],
classes: ["class_definition"],
methods: ["function_definition"],
// Methods within classes are also function_definition
imports: ["import_statement", "import_from_statement"],
variables: ["assignment"]
// Variable assignment in Python
},
go: {
functions: ["function_declaration"],
methods: ["method_declaration"],
types: ["type_declaration"],
imports: ["import_declaration"],
variables: [
"var_spec",
"const_spec",
"short_var_declaration"
]
},
rust: {
functions: ["function_item"],
structs: ["struct_item"],
impls: ["impl_item"],
traits: ["trait_item"],
imports: ["use_declaration"],
variables: ["let_declaration"]
},
java: {
functions: ["method_declaration"],
classes: ["class_declaration"],
interfaces: ["interface_declaration"],
imports: ["import_declaration"],
variables: ["local_variable_declaration"]
},
ruby: {
functions: ["method"],
classes: ["class"],
modules: ["module"],
// NOTE(review): confirm "require" and "load" are node types the Ruby grammar emits.
imports: ["require", "load"],
variables: ["assignment"]
},
c: {
functions: ["function_definition"],
structs: ["struct_specifier"],
enums: ["enum_specifier"],
typedefs: ["type_definition"],
includes: ["preproc_include"],
variables: ["declaration"]
},
cpp: {
functions: ["function_definition"],
classes: ["class_specifier"],
structs: ["struct_specifier"],
namespaces: ["namespace_definition"],
templates: ["template_declaration"],
includes: ["preproc_include"],
variables: ["declaration"]
},
html: {
elements: ["element"],
scripts: ["script_element"],
styles: ["style_element"]
},
css: {
rules: ["rule_set"],
media: ["media_statement"],
keyframes: ["keyframes_statement"],
imports: ["import_statement"]
},
bash: {
functions: ["function_definition"],
commands: ["command"],
variables: ["variable_assignment"]
}
};
/**
 * Collects every node type registered for a language into one set.
 *
 * @param {string} language - Language id used as a key of LANGUAGE_NODE_TYPES.
 * @returns {Set<string>} All node types of that language; when the language
 *   has no entry, the TypeScript node types are used instead.
 */
var createBoundaryNodeTypes = (language) => {
  const groups = LANGUAGE_NODE_TYPES[language] || LANGUAGE_NODE_TYPES.typescript;
  return new Set(Object.values(groups).flat());
};
/**
 * Creates a function that derives a display name for a syntax node.
 *
 * Lookup order:
 *   1. the node's `name` field, for every language;
 *   2. language-specific rules (JavaScript/TypeScript and Go);
 *   3. the first direct child whose type is "identifier".
 *
 * The former Go/Rust/Java branches that only repeated the `name` field lookup
 * were removed: step 1 has already returned whenever that field has text.
 *
 * @param {string} language - Language id of the tree being inspected.
 * @returns {(node: object) => (string|undefined)} Extractor returning the
 *   name, or undefined when none can be found.
 */
var createNodeNameExtractor = (language) => {
  const isIdentifier = (candidate) => candidate.type === "identifier";

  // JavaScript/TypeScript: names that live outside the node's own `name` field.
  const findJsName = (node) => {
    switch (node.type) {
      case "arrow_function":
        // Looks for a `name` field on the parent node.
        return node.parent?.childForFieldName("name")?.text;
      case "variable_declaration":
      case "lexical_declaration":
        // Only the first declarator is considered.
        return node.children
          .find((child) => child.type === "variable_declarator")
          ?.childForFieldName("name")?.text;
      case "method_definition":
        return node.childForFieldName("key")?.text;
      default:
        return undefined;
    }
  };

  // Go: follow the first-child chain until a level has an identifier child.
  const findGoIdentifier = (node) => {
    let current = node;
    while (current.children.length > 0) {
      const identifier = current.children.find(isIdentifier);
      if (identifier) {
        return identifier;
      }
      if (!current.firstChild) {
        break;
      }
      current = current.firstChild;
    }
    return undefined;
  };

  return (node) => {
    const nameField = node.childForFieldName?.("name");
    if (nameField?.text) {
      return nameField.text;
    }
    if (language === "javascript" || language === "typescript") {
      const jsName = findJsName(node);
      if (jsName) {
        return jsName;
      }
    } else if (language === "go") {
      const identifier = findGoIdentifier(node);
      if (identifier) {
        // Returned as-is, even when the text is empty.
        return identifier.text;
      }
    }
    return node.children?.find?.(isIdentifier)?.text;
  };
};
/**
 * Creates a function that locates the documentation attached to a node.
 * (The identifier keeps its original spelling because createNodeTraverser
 * refers to it by this name.)
 *
 * Strategy per language:
 *   - javascript, typescript, rust, java, ruby, c, cpp, go: the sibling just
 *     before the node, when its type contains "comment". For JavaScript and
 *     TypeScript nodes wrapped in an `export_statement`, the sibling before
 *     that wrapper is used instead.
 *   - python: `node.lastChild.firstChild.firstChild`, when its type is "string".
 *   - anything else (html, css, bash, unknown ids): never reports docs.
 *
 * @param {string} language - Language id of the tree being inspected.
 * @returns {(node: object) => {hasDocs: boolean, detail?: {text: string, startIndex: number, endIndex: number}}}
 *   Extractor that always returns an object, so callers can read `hasDocs`
 *   without a null check.
 */
var createDocsExtracor = (language) => {
  const noDocs = () => ({ hasDocs: false });
  const docsFrom = (docNode) => ({
    hasDocs: true,
    detail: {
      text: docNode.text,
      startIndex: docNode.startIndex,
      endIndex: docNode.endIndex
    }
  });
  const usesExportWrapper = language === "javascript" || language === "typescript";

  const extractOuterDocComment = (node) => {
    const isExported = usesExportWrapper && node.parent?.type === "export_statement";
    const candidate = isExported ? node.parent.previousSibling : node.previousSibling;
    return candidate && candidate.type.includes("comment") ? docsFrom(candidate) : noDocs();
  };

  const extractPyDocComment = (node) => {
    const candidate = node.lastChild?.firstChild?.firstChild;
    return candidate && candidate.type === "string" ? docsFrom(candidate) : noDocs();
  };

  return (node) => {
    switch (language) {
      case "javascript":
      case "typescript":
      case "rust":
      case "java":
      case "ruby":
      case "c":
      case "cpp":
      case "go":
        return extractOuterDocComment(node);
      case "python":
        return extractPyDocComment(node);
      default:
        // html, css, bash, and any unrecognised language id. Without this
        // branch an unknown id returned undefined and callers crashed on
        // `docs.hasDocs`.
        return noDocs();
    }
  };
};
// src/chunking/cst-operations.ts
/**
 * Creates a traverser that walks a syntax tree depth-first and records every
 * boundary node of the given language.
 *
 * @param {string} language - Language id; selects the boundary node types and
 *   the name and docs extractors.
 * @returns {{traverse: (node: object, filter: (language: string, node: object) => boolean) => object[]}}
 *   `traverse` returns one record per node whose type is a boundary type and
 *   for which `filter` returns a truthy value. Each record holds `type`,
 *   `parentInfo` (names of the enclosing recorded boundaries, outermost
 *   first), `name`, `startIndex` (start of the doc comment when there is one,
 *   otherwise of the node), `endIndex`, `text` (the node's text only) and
 *   `docsText`.
 */
var createNodeTraverser = (language) => {
  const boundaryNodeTypes = createBoundaryNodeTypes(language);
  const extractName = createNodeNameExtractor(language);
  const extractDocs = createDocsExtracor(language);
  const traverse = (node, filter) => {
    const boundaries = [];
    const visit = (current, parentInfo) => {
      let childParentInfo = parentInfo;
      if (boundaryNodeTypes.has(current.type) && filter(language, current)) {
        // Docs and name are only needed for recorded nodes, so they are
        // computed here rather than for every node in the tree.
        const docs = extractDocs(current);
        const name = extractName(current);
        boundaries.push({
          type: current.type,
          parentInfo,
          name,
          startIndex: docs.hasDocs ? docs.detail.startIndex : current.startIndex,
          endIndex: current.endIndex,
          text: current.text,
          docsText: docs.hasDocs ? docs.detail.text : ""
        });
        if (name) {
          childParentInfo = [...parentInfo, name];
        }
      }
      for (const child of current.children) {
        visit(child, childParentInfo);
      }
    };
    visit(node, []);
    return boundaries;
  };
  return { traverse };
};
/**
 * Bundles the two steps of syntax-tree based chunking around a parser factory.
 *
 * @param {{createParser: (language: string) => Promise<*>}} factory - Parser factory.
 * @returns {{parseAndExtractBoundaries: Function, boundariesToChunks: Function}}
 */
var createCSTOperations = (factory) => {
  /**
   * Parses `code` and returns the boundary records found in its tree.
   *
   * @param {string} code - Source text to parse.
   * @param {string} language - Language id of `code`.
   * @param {{filter?: (language: string, node: object) => boolean}} [options] -
   *   Optional node filter; when absent every boundary node is kept.
   * @returns {Promise<object[]>} Boundary records in traversal order.
   * @throws {Error} When the factory provides no parser for `language`.
   */
  const parseAndExtractBoundaries = async (code, language, options) => {
    const parser = await factory.createParser(language);
    if (!parser) {
      throw new Error(`No parser available for language: ${language}`);
    }
    const traverser = createNodeTraverser(language);
    const tree = parser.parse(code);
    // `options` may be omitted entirely; fall back to a keep-everything filter.
    const keepNode = options?.filter ?? (() => true);
    return traverser.traverse(tree.rootNode, keepNode);
  };

  /**
   * Converts boundary records into chunk objects.
   *
   * @param {object[]} boundaries - Records from parseAndExtractBoundaries.
   * @returns {object[]} One chunk per boundary, in the same order.
   */
  const boundariesToChunks = (boundaries) => {
    return boundaries.map((boundary) => ({
      content: boundary.text,
      startOffset: boundary.startIndex,
      endOffset: boundary.endIndex,
      // Left empty here: the I/O functions in io/file-operations.ts fill in
      // the real path afterwards.
      filePath: "",
      boundary: {
        type: boundary.type,
        name: boundary.name,
        parent: boundary.parentInfo,
        docs: boundary.docsText
      }
    }));
  };
  return { parseAndExtractBoundaries, boundariesToChunks };
};
/**
 * Runs `operation` with a fresh set of CST operations bound to `factory`.
 *
 * @param {object} factory - Parser factory handed to createCSTOperations.
 * @param {(ops: object) => *} operation - Callback receiving the operations.
 * @returns {Promise<*>} Whatever `operation` resolves to; rejections propagate.
 */
var withCSTParsing = async (factory, operation) => {
  const cstOps = createCSTOperations(factory);
  const outcome = await operation(cstOps);
  return outcome;
};
/**
 * Creates the two chunking entry points.
 *
 * @returns {{chunkWithCST: Function, chunkWithFallback: Function}}
 *   - `chunkWithCST(code, language, _options, factory)` parses `code` with a
 *     parser from `factory` and resolves to its chunks.
 *   - `chunkWithFallback(code, filePath, options, fallback)` derives the
 *     language from the extension of `filePath` and uses a private parser
 *     factory. `fallback(code, language, options)` is used when the extension
 *     is unknown (language "unknown") or when CST chunking throws.
 */
var createCSTChunkingOperations = () => {
  const chunkWithCST = async (code, language, _options, factory) => {
    const extractChunks = async (ops) => {
      const found = await ops.parseAndExtractBoundaries(code, language, _options);
      return ops.boundariesToChunks(found);
    };
    return withCSTParsing(factory, extractChunks);
  };

  const chunkWithFallback = async (code, filePath, options, fallback) => {
    const extension = import_node_path.default.extname(filePath);
    const language = getLanguageFromExtension(extension);
    if (!language) {
      return fallback(code, "unknown", options);
    }
    // The private factory is disposed once this call is done with it.
    const privateFactory = createParserFactory();
    try {
      return await chunkWithCST(code, language, options, privateFactory);
    } catch (error) {
      return fallback(code, language, options);
    } finally {
      privateFactory.dispose();
    }
  };

  return { chunkWithCST, chunkWithFallback };
};
// src/io/file-operations.ts
/**
 * Reports whether a file name has an extension that maps to a language.
 *
 * The check goes through getLanguageFromExtension so it cannot drift from the
 * extension table. The previous unanchored pattern accepted names such as
 * "package.json" and ".cache" and rejected ".rs", ".sh", ".mjs" and ".scss".
 *
 * @param {string} filename - File name or path.
 * @returns {boolean} True when the extension has a registered language.
 */
var isSupportedLanguageExtension = (filename) => {
  const extension = import_node_path2.default.extname(filename);
  return Boolean(getLanguageFromExtension(extension));
};
/**
 * Reads one file as UTF-8 and splits it into chunks.
 *
 * @param {object} factory - Parser factory passed through to parseCodeAndChunk.
 * @param {object} options - Chunking options passed through to parseCodeAndChunk.
 * @param {string} baseDirPath - Directory that `relativeFilePath` is relative to.
 * @param {string} relativeFilePath - Path of the file below `baseDirPath`.
 * @returns {Promise<object[]>} Chunks with `filePath` set to `relativeFilePath`;
 *   an empty array when the extension maps to no language.
 */
var readFileAndChunk = async (factory, options, baseDirPath, relativeFilePath) => {
  const absolutePath = import_node_path2.default.join(baseDirPath, relativeFilePath);
  const code = await import_promises.default.readFile(absolutePath, "utf8");
  const language = getLanguageFromExtension(import_node_path2.default.extname(relativeFilePath));
  if (!language) {
    return [];
  }
  const chunks = await parseCodeAndChunk(code, language, factory, options);
  const withPath = (chunk) => ({ ...chunk, filePath: relativeFilePath });
  return chunks.map(withPath);
};
/**
 * Chunks every supported file below `baseDirPath`, starting the recursive
 * walk at the directory root.
 *
 * @param {object} factory - Parser factory passed through to the walker.
 * @param {object} options - Chunking options; `excludeDirs` is read by the walker.
 * @param {string} baseDirPath - Root directory to walk.
 * @returns {Promise<object[]>} Flat list of chunks from all visited files.
 */
var readDirectoryAndChunk = async (factory, options, baseDirPath) => {
  const rootRelativePath = "";
  return _readDirectoryAndChunkRecursive(factory, options, baseDirPath, rootRelativePath);
};
/**
 * Walks `baseDirPath/relativePath` recursively and chunks every supported file.
 *
 * Directories whose name matches any pattern in `options.excludeDirs`
 * (default: anything containing "node_modules", and dot-directories) are
 * skipped and never treated as files. A file that cannot be read or chunked
 * is reported with a warning and contributes no chunks. A directory that
 * cannot be listed still rejects the whole walk.
 *
 * @param {object} factory - Parser factory passed through to readFileAndChunk.
 * @param {{excludeDirs?: RegExp[]}} options - Chunking options.
 * @param {string} baseDirPath - Root directory of the walk.
 * @param {string} [relativePath=""] - Current directory, relative to the root.
 * @returns {Promise<object[]>} Flat list of chunks, in directory-entry order.
 */
var _readDirectoryAndChunkRecursive = async (factory, options, baseDirPath, relativePath = "") => {
  const currentPath = import_node_path2.default.join(baseDirPath, relativePath);
  const entries = await import_promises.default.readdir(currentPath, { withFileTypes: true });
  const excludeDirs = options?.excludeDirs ?? [/node_modules/, /^\.[A-Za-z0-9_-]+$/];

  const processEntry = async (entry) => {
    const entryRelativePath = import_node_path2.default.join(relativePath, entry.name);
    if (entry.isDirectory()) {
      // Directories end here. An excluded one must not fall through to the
      // file branch, where reading it would fail with EISDIR.
      const isExcluded = excludeDirs.some((pattern) => pattern.test(entry.name));
      return isExcluded
        ? []
        : _readDirectoryAndChunkRecursive(factory, options, baseDirPath, entryRelativePath);
    }
    if (!isSupportedLanguageExtension(entry.name)) {
      return [];
    }
    try {
      // `await` is required here: without it the rejection bypasses the catch.
      return await readFileAndChunk(factory, options, baseDirPath, entryRelativePath);
    } catch (error) {
      console.warn(`Skipping ${entryRelativePath}:`, error);
      return [];
    }
  };

  const nestedResults = await Promise.all(entries.map((entry) => processEntry(entry)));
  return nestedResults.flat();
};
/**
 * Splits source text into chunks using the syntax tree of `language`.
 *
 * @param {string} code - Source text.
 * @param {string} language - Language id of `code`.
 * @param {object} factory - Parser factory that supplies the parser.
 * @param {object} options - Chunking options forwarded to the CST operations.
 * @returns {Promise<object[]>} The chunks found in `code`.
 */
var parseCodeAndChunk = (code, language, factory, options) => {
  const { chunkWithCST } = createCSTChunkingOperations();
  return chunkWithCST(code, language, options, factory);
};
// Annotate the CommonJS export names for ESM import in node.
// The `0 &&` guard means this assignment never executes; the statement only
// spells out the export names as a static object literal.
0 && (module.exports = {
createParserFactory,
parseCodeAndChunk,
readDirectoryAndChunk,
readFileAndChunk
});
/*!
* Copyright (c) ushirononeko 2025
* Copyright (c) sirasagi62 2025
* Published under MIT License
* see https://opensource.org/licenses/MIT
*
* This code was originally created by ushirononeko and modified by sirasagi62
* Original: https://github.com/ushironoko/gistdex
*/
;