UNPKG

@langchain/community

Version:
426 lines (425 loc) 16.9 kB
Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" });
const require_runtime = require("../../_virtual/_rolldown/runtime.cjs");
const require_extname = require("../../utils/extname.cjs");
let _langchain_core_utils_env = require("@langchain/core/utils/env");
let _langchain_core_utils_async_caller = require("@langchain/core/utils/async_caller");
let _langchain_core_documents = require("@langchain/core/documents");
let _langchain_core_document_loaders_base = require("@langchain/core/document_loaders/base");
let _langchain_classic_document_loaders_fs_directory = require("@langchain/classic/document_loaders/fs/directory");
let ignore = require("ignore");
ignore = require_runtime.__toESM(ignore);
let binary_extensions = require("binary-extensions");
binary_extensions = require_runtime.__toESM(binary_extensions);

//#region src/document_loaders/web/github.ts
var github_exports = /* @__PURE__ */ require_runtime.__exportAll({ GithubRepoLoader: () => GithubRepoLoader });
const extensions = /* @__PURE__ */ new Set(binary_extensions.default);

/**
 * Checks whether a file path refers to a binary file, judging only by its
 * extension (case-insensitive, looked up in the `binary-extensions` list).
 * @param name The file path to check.
 * @returns `true` when the extension is a known binary extension.
 */
function isBinaryPath(name) {
  const extension = require_extname.extname(name).slice(1).toLowerCase();
  return extensions.has(extension);
}

/**
 * Builds the loader used for a submodule. It copies every setting the parent
 * loader stores on itself and pins `branch` to the submodule's recorded ref.
 * Shared by `loadSubmodule` and `loadSubmoduleAsStream` so both paths
 * configure the child loader identically.
 * @param parent The loader whose settings are inherited.
 * @param submoduleInfo Submodule descriptor (`url`, `ref`, ...).
 * @returns A new GithubRepoLoader pointing at the submodule repository.
 */
function createSubmoduleLoader(parent, submoduleInfo) {
  return new GithubRepoLoader(submoduleInfo.url, {
    accessToken: parent.accessToken,
    baseUrl: parent.baseUrl,
    apiUrl: parent.apiUrl,
    branch: submoduleInfo.ref,
    recursive: parent.recursive,
    processSubmodules: parent.processSubmodules,
    unknown: parent.unknown,
    ignoreFiles: parent.ignoreFiles,
    ignorePaths: parent.ignorePaths,
    verbose: parent.verbose,
    maxConcurrency: parent.maxConcurrency,
    maxRetries: parent.maxRetries
  });
}

/**
 * A class that extends the BaseDocumentLoader and implements the
 * GithubRepoLoaderParams interface. It represents a document loader for
 * loading files from a GitHub repository.
 */
var GithubRepoLoader = class GithubRepoLoader extends _langchain_core_document_loaders_base.BaseDocumentLoader {
  baseUrl;
  apiUrl;
  owner;
  repo;
  initialPath;
  headers = {};
  branch;
  recursive;
  processSubmodules;
  unknown;
  accessToken;
  ignoreFiles;
  ignore;
  verbose;
  maxConcurrency;
  maxRetries;
  caller;
  ignorePaths;
  submoduleInfos;

  /**
   * @param githubUrl Repository URL; may carry a `/tree/<branch>/<path>` suffix
   *   whose `<path>` part becomes the initial path.
   * @param options Loader options. Defaults: `accessToken` from the
   *   `GITHUB_ACCESS_TOKEN` environment variable, `baseUrl`
   *   "https://github.com", `apiUrl` "https://api.github.com", `branch` "main",
   *   `recursive` true, `processSubmodules` false, `unknown` Warn,
   *   `ignoreFiles` [], `verbose` false, `maxConcurrency` 2, `maxRetries` 2.
   *   Any remaining properties are forwarded to the AsyncCaller.
   * @throws Error when `processSubmodules` is true while `recursive` is false,
   *   or when `githubUrl` cannot be parsed.
   */
  constructor(githubUrl, {
    accessToken = (0, _langchain_core_utils_env.getEnvironmentVariable)("GITHUB_ACCESS_TOKEN"),
    baseUrl = "https://github.com",
    apiUrl = "https://api.github.com",
    branch = "main",
    recursive = true,
    processSubmodules = false,
    unknown = _langchain_classic_document_loaders_fs_directory.UnknownHandling.Warn,
    ignoreFiles = [],
    ignorePaths,
    verbose = false,
    maxConcurrency = 2,
    maxRetries = 2,
    ...rest
  } = {}) {
    super();
    // baseUrl must be set before parsing: the URL pattern is built from it.
    this.baseUrl = baseUrl;
    this.apiUrl = apiUrl;
    const { owner, repo, path } = this.extractOwnerAndRepoAndPath(githubUrl);
    this.owner = owner;
    this.repo = repo;
    this.initialPath = path;
    this.branch = branch;
    this.recursive = recursive;
    if (processSubmodules && !recursive) throw new Error(`Input property "recursive" must be true if "processSubmodules" is true.`);
    this.processSubmodules = processSubmodules;
    this.unknown = unknown;
    this.accessToken = accessToken;
    this.ignoreFiles = ignoreFiles;
    this.verbose = verbose;
    this.maxConcurrency = maxConcurrency;
    this.maxRetries = maxRetries;
    this.headers = { "User-Agent": "langchain" };
    this.caller = new _langchain_core_utils_async_caller.AsyncCaller({
      maxConcurrency,
      maxRetries,
      ...rest
    });
    this.ignorePaths = ignorePaths;
    // When ignorePaths is given, shouldIgnore() consults this matcher instead of ignoreFiles.
    if (ignorePaths) this.ignore = (0, ignore.default)().add(ignorePaths);
    if (this.accessToken) this.headers = {
      ...this.headers,
      Authorization: `Bearer ${this.accessToken}`
    };
  }

  /**
   * Extracts the owner, repository, and path from a GitHub URL.
   * @param url The GitHub URL to extract information from.
   * @returns An object containing the owner, repository, and path extracted
   *   from the GitHub URL; `path` is "" when the URL has no `/tree/...` part.
   * @throws Error when the URL does not match `<baseUrl>/<owner>/<repo>`.
   */
  extractOwnerAndRepoAndPath(url) {
    // Escape regex metacharacters so that e.g. the "." in "github.com" only
    // matches a literal dot instead of any character.
    const escapedBaseUrl = this.baseUrl.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const urlPattern = new RegExp(`${escapedBaseUrl}/([^/]+)/([^/]+)(/tree/[^/]+/(.+))?`, "i");
    const match = url.match(urlPattern);
    if (!match) throw new Error("Invalid GitHub URL format.");
    return {
      owner: match[1],
      repo: match[2],
      path: match[4] || ""
    };
  }

  /**
   * Fetches the files from the GitHub repository and creates Document
   * instances for each file. When `processSubmodules` is set, documents of
   * all submodules are appended afterwards.
   * @returns A promise that resolves to an array of Document instances.
   */
  async load() {
    this.log(`Loading documents from ${this.baseUrl}/${this.owner}/${this.repo}/${this.initialPath}...`);
    const fileResponses = await this.processRepo();
    const documents = fileResponses.map(({ contents, metadata }) => new _langchain_core_documents.Document({
      pageContent: contents,
      metadata
    }));
    if (this.processSubmodules) {
      await this.getSubmoduleInfo();
      for (const submoduleInfo of this.submoduleInfos) {
        // Append one by one: spreading a very large array into push() can
        // exceed the engine's argument limit.
        for (const document of await this.loadSubmodule(submoduleInfo)) documents.push(document);
      }
    }
    return documents;
  }

  /**
   * Asynchronously streams documents from the entire GitHub repository.
   * It is suitable for situations where processing large repositories in a
   * memory-efficient manner is required.
   * @yields A Document for each file found in the repository and, when
   *   `processSubmodules` is set, in its submodules.
   */
  async *loadAsStream() {
    this.log(`Loading documents from ${this.baseUrl}/${this.owner}/${this.repo}/${this.initialPath}...`);
    yield* this.processRepoAsStream(this.initialPath);
    if (!this.processSubmodules) return;
    await this.getSubmoduleInfo();
    for (const submoduleInfo of this.submoduleInfos) yield* this.loadSubmoduleAsStream(submoduleInfo);
  }

  /**
   * Loads the information about Git submodules from the repository, if
   * available, and stores it in `this.submoduleInfos` (an empty array when
   * the repository root has no `.gitmodules` file).
   */
  async getSubmoduleInfo() {
    this.log("Loading info about submodules...");
    const rootFiles = await this.fetchRepoFiles("");
    const gitmodulesFile = rootFiles.find(({ name }) => name === ".gitmodules");
    if (gitmodulesFile) {
      const gitmodulesContent = await this.fetchFileContent({ download_url: gitmodulesFile.download_url });
      this.submoduleInfos = await this.parseGitmodules(gitmodulesContent);
    } else this.submoduleInfos = [];
    this.log(`Found ${this.submoduleInfos.length} submodules:`);
    for (const submoduleInfo of this.submoduleInfos) this.log(JSON.stringify(submoduleInfo));
  }

  /**
   * Parses the given content of a .gitmodules file. Furthermore, queries the
   * current SHA ref of all submodules.
   * @param gitmodulesContent the content of a .gitmodules file
   * @returns The submodule information as an array of
   *   `{ name, path, url, ref }` objects; a trailing ".git" is stripped from
   *   `url`.
   * @throws Error when an entry cannot be parsed or lacks `path` or `url`.
   */
  async parseGitmodules(gitmodulesContent) {
    // The section pattern below anchors on "\n"; normalise CRLF first so
    // files with Windows line endings are not silently parsed as empty.
    let validGitmodulesContent = gitmodulesContent.replace(/\r\n/g, "\n");
    if (!validGitmodulesContent.endsWith("\n")) validGitmodulesContent += "\n";
    const submodulePattern = /\[submodule "(.*?)"]\n((\s+.*?\s*=\s*.*?\n)*)/g;
    const keyValuePattern = /\s+(.*?)\s*=\s*(.*?)\s/g;
    const submoduleInfos = [];
    for (const [, name, propertyLines] of validGitmodulesContent.matchAll(submodulePattern)) {
      if (!name || !propertyLines) throw new Error("Could not parse submodule entry");
      let path;
      let url;
      for (const [, key, value] of propertyLines.matchAll(keyValuePattern)) {
        if (!key || !value) throw new Error(`Could not parse key/value pairs for submodule ${name}`);
        if (key === "path") path = value;
        else if (key === "url") url = value.endsWith(".git") ? value.slice(0, -4) : value;
      }
      if (!path || !url) throw new Error(`Missing properties for submodule ${name}`);
      // The first entry returned for the submodule path supplies the ref
      // (its `sha`) that the submodule loader later uses as its branch.
      const files = await this.fetchRepoFiles(path);
      submoduleInfos.push({
        name,
        path,
        url,
        ref: files[0].sha
      });
    }
    return submoduleInfos;
  }

  /**
   * Loads the documents of the given submodule. Uses the same parameters as
   * for the current repository.
   * External submodules, i.e. submodules pointing to another GitHub instance,
   * are ignored, as are submodules that are not below the initial path.
   * @param submoduleInfo the info about the submodule to be loaded
   * @returns A promise resolving to the submodule's documents (empty when the
   *   submodule is ignored).
   */
  async loadSubmodule(submoduleInfo) {
    if (!submoduleInfo.url.startsWith(this.baseUrl)) {
      this.log(`Ignoring external submodule ${submoduleInfo.url}.`);
      return [];
    }
    if (!submoduleInfo.path.startsWith(this.initialPath)) {
      this.log(`Ignoring submodule ${submoduleInfo.url}, as it is not on initial path.`);
      return [];
    }
    this.log(`Accessing submodule ${submoduleInfo.name} (${submoduleInfo.url})...`);
    return createSubmoduleLoader(this, submoduleInfo).load();
  }

  /**
   * Asynchronously processes and streams the contents of a specified
   * submodule in the GitHub repository. Applies the same skip rules as
   * `loadSubmodule`.
   * @param submoduleInfo the info about the submodule to be loaded
   * @yields A Document for each file found in the submodule.
   */
  async *loadSubmoduleAsStream(submoduleInfo) {
    if (!submoduleInfo.url.startsWith(this.baseUrl)) {
      this.log(`Ignoring external submodule ${submoduleInfo.url}.`);
      // Must return here: yielding an empty list alone would fall through
      // and load the submodule that was just reported as ignored.
      return;
    }
    if (!submoduleInfo.path.startsWith(this.initialPath)) {
      this.log(`Ignoring submodule ${submoduleInfo.url}, as it is not on initial path.`);
      return;
    }
    this.log(`Accessing submodule ${submoduleInfo.name} (${submoduleInfo.url})...`);
    // Stream the submodule repository from its own initial path, mirroring
    // loadSubmodule()'s use of load(). submoduleInfo.path is relative to the
    // parent repository and is not a path inside the submodule repository.
    yield* createSubmoduleLoader(this, submoduleInfo).loadAsStream();
  }

  /**
   * Determines whether a file or directory should be ignored based on its
   * path and type. Binary files are always ignored; otherwise the
   * `ignorePaths` matcher decides when configured, else `ignoreFiles`
   * (exact string match or RegExp test) is applied to non-directories.
   * @param path The path of the file or directory.
   * @param fileType The type of the file or directory.
   * @returns A boolean indicating whether the file or directory should be ignored.
   * @throws Error when an `ignoreFiles` entry is neither a string nor testable.
   */
  shouldIgnore(path, fileType) {
    const isDirectory = fileType === "dir";
    if (!isDirectory && isBinaryPath(path)) return true;
    if (this.ignore !== void 0) return this.ignore.ignores(path);
    if (isDirectory) return false;
    return this.ignoreFiles.some((pattern) => {
      if (typeof pattern === "string") return path === pattern;
      try {
        return pattern.test(path);
      } catch {
        throw new Error(`Unknown ignore file pattern: ${pattern}`);
      }
    });
  }

  /**
   * Fetches a file's content and pairs it with its metadata. A failed fetch
   * is reported through `handleError`; if that does not throw, the content
   * falls back to an empty string.
   * @param file The file entry to fetch.
   * @returns A promise resolving to `{ contents, metadata }`.
   */
  async fetchFileContentWrapper(file) {
    const contents = await this.fetchFileContent(file).catch((error) => {
      // Report the path; interpolating the object itself printed "[object Object]".
      this.handleError(`Failed wrap file content: ${file.path}, ${error}`);
    });
    return {
      contents: contents || "",
      metadata: {
        source: file.path,
        repository: `${this.baseUrl}/${this.owner}/${this.repo}`,
        branch: this.branch
      }
    };
  }

  /**
   * Maps a list of files / directories to a list of promises that will fetch
   * the file contents. Ignored entries and zero-size files are skipped;
   * directories are expanded only when `recursive` is set.
   * @param files Entries of one directory listing.
   * @returns A promise resolving to the file promises of this directory
   *   followed by those of all nested directories.
   */
  async getCurrentDirectoryFilePromises(files) {
    const filePromises = [];
    const directoryPromises = [];
    for (const file of files) {
      if (file.type !== "dir" && this.shouldIgnore(file.path, file.type)) continue;
      if (file.type === "file" && file.size === 0) continue;
      // fetchFileContentWrapper is async, so it never throws synchronously;
      // failures surface when the collected promise is awaited.
      if (file.type !== "dir") filePromises.push(this.fetchFileContentWrapper(file));
      else if (this.recursive) directoryPromises.push(this.processDirectory(file.path));
    }
    const nestedFilePromises = await Promise.all(directoryPromises);
    return [...filePromises, ...nestedFilePromises.flat()];
  }

  /**
   * Begins the process of fetching the contents of the repository, starting
   * at the initial path.
   * @returns A promise resolving to the `{ contents, metadata }` of every file.
   */
  async processRepo() {
    try {
      const files = await this.fetchRepoFiles(this.initialPath);
      const filePromises = await this.getCurrentDirectoryFilePromises(files);
      return Promise.all(filePromises);
    } catch (error) {
      this.handleError(`Failed to process directory: ${this.initialPath}, ${error}`);
      throw error;
    }
  }

  /**
   * Asynchronously processes the contents of the GitHub repository below the
   * given path, streaming each file as a Document object. Applies the same
   * filtering rules as `getCurrentDirectoryFilePromises`: ignored entries and
   * zero-size files are skipped, only entries typed "dir" are recursed into,
   * and everything else is fetched as a file.
   * @param path The path of the directory to process.
   * @yields A Document for each file found below the path.
   */
  async *processRepoAsStream(path) {
    const files = await this.fetchRepoFiles(path);
    for (const file of files) {
      if (file.type !== "dir" && this.shouldIgnore(file.path, file.type)) continue;
      if (file.type === "file" && file.size === 0) continue;
      if (file.type !== "dir") {
        try {
          const { contents, metadata } = await this.fetchFileContentWrapper(file);
          yield new _langchain_core_documents.Document({
            pageContent: contents,
            metadata
          });
        } catch (error) {
          this.handleError(`Failed to fetch file content: ${file.path}, ${error}`);
        }
      } else if (this.recursive) {
        // Recursing only into "dir" entries keeps an entry of any other type
        // from being re-listed under its own path over and over.
        yield* this.processDirectoryAsStream(file.path);
      }
    }
  }

  /**
   * Fetches the contents of a directory and maps the file / directory paths
   * to promises that will fetch the file / directory contents.
   * @param path The path of the directory to process.
   * @returns A promise that resolves to an array of promises that will fetch
   *   the file contents.
   */
  async processDirectory(path) {
    try {
      const files = await this.fetchRepoFiles(path);
      return this.getCurrentDirectoryFilePromises(files);
    } catch (error) {
      this.handleError(`Failed to process directory: ${path}, ${error}`);
      throw error;
    }
  }

  /**
   * Asynchronously processes the contents of a given directory in the GitHub
   * repository, streaming each file as a Document object. Delegates to
   * `processRepoAsStream`, which implements the shared traversal.
   * @param path The path of the directory to process.
   * @yields A Document for each file in the directory.
   */
  async *processDirectoryAsStream(path) {
    yield* this.processRepoAsStream(path);
  }

  /**
   * Fetches the files from a GitHub repository.
   * If the path denotes a single file, the resulting array contains only one element.
   * @param path The path of the repository to fetch the files from.
   * @returns A promise that resolves to an array of GithubFile instances.
   * @throws Error when the API responds with a non-OK status.
   */
  async fetchRepoFiles(path) {
    // Encode the ref as well, so branch names containing characters such as
    // "#" or "&" do not corrupt the query string.
    const url = `${this.apiUrl}/repos/${this.owner}/${this.repo}/contents/${encodeURIComponent(path)}?ref=${encodeURIComponent(this.branch)}`;
    return this.caller.call(async () => {
      this.log(`Fetching ${url}`);
      const response = await fetch(url, { headers: this.headers });
      const data = await response.json();
      if (!response.ok) throw new Error(`Unable to fetch repository files: ${response.status} ${JSON.stringify(data)}`);
      return Array.isArray(data) ? data : [data];
    });
  }

  /**
   * Fetches the content of a file from a GitHub repository.
   * @param file The file to fetch the content from (its `download_url` is used).
   * @returns A promise that resolves to the content of the file.
   * @throws Error when the download responds with a non-OK status, so an
   *   HTTP error body is never returned as if it were file content.
   */
  async fetchFileContent(file) {
    return this.caller.call(async () => {
      this.log(`Fetching ${file.download_url}`);
      const response = await fetch(file.download_url, { headers: this.headers });
      if (!response.ok) throw new Error(`Unable to fetch file content: ${response.status} ${file.download_url}`);
      return response.text();
    });
  }

  /**
   * Handles errors based on the unknown handling option: Ignore does
   * nothing, Warn prints to `console.warn`, Error throws.
   * @param message The error message.
   * @returns void
   * @throws Error for `UnknownHandling.Error` or an unrecognised option.
   */
  handleError(message) {
    switch (this.unknown) {
      case _langchain_classic_document_loaders_fs_directory.UnknownHandling.Ignore: break;
      case _langchain_classic_document_loaders_fs_directory.UnknownHandling.Warn:
        console.warn(message);
        break;
      case _langchain_classic_document_loaders_fs_directory.UnknownHandling.Error: throw new Error(message);
      default: throw new Error(`Unknown unknown handling: ${this.unknown}`);
    }
  }

  /**
   * Logs the given message to the console, if parameter 'verbose' is set to true.
   * @param message the message to be logged.
   */
  log(message) {
    if (this.verbose) console.log(message);
  }
};

//#endregion
exports.GithubRepoLoader = GithubRepoLoader;
Object.defineProperty(exports, "github_exports", {
  enumerable: true,
  get: function() {
    return github_exports;
  }
});
//# sourceMappingURL=github.cjs.map