@langchain/community
Version:
Third-party integrations for LangChain.js
147 lines (146 loc) • 5.66 kB
JavaScript
import { __exportAll } from "../../_virtual/_rolldown/runtime.js";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
import { htmlToText } from "html-to-text";
//#region src/document_loaders/web/confluence.ts
// Namespace-style export object for this module, built with the bundler runtime helper `__exportAll`.
// The value is a getter function rather than the class itself, so `ConfluencePagesLoader`
// (assigned further down in this file) is only read when that function is called —
// presumably lazily by the helper; confirm against the rolldown runtime.
var confluence_exports = /* @__PURE__ */ __exportAll({ ConfluencePagesLoader: () => ConfluencePagesLoader });
/**
* Class representing a document loader for loading pages from Confluence.
* @example
* ```typescript
* const loader = new ConfluencePagesLoader({
* baseUrl: "https://example.atlassian.net/wiki",
* spaceKey: "~EXAMPLE362906de5d343d49dcdbae5dEXAMPLE",
* username: "your-username",
* accessToken: "your-access-token",
* });
* const documents = await loader.load();
* console.log(documents);
* ```
*/
var ConfluencePagesLoader = class extends BaseDocumentLoader {
  /** Base URL of the Confluence instance; REST and page URLs are built by appending to it. */
  baseUrl;
  /** Key of the space whose pages are loaded. */
  spaceKey;
  /** Username for Basic authentication (used together with `accessToken`). */
  username;
  /** Access token for Basic authentication (used together with `username`). */
  accessToken;
  /** Default page size requested from the REST API. */
  limit;
  /** Maximum number of attempts per request before the error is propagated. */
  maxRetries;
  /**
   * expand parameter for confluence rest api
   * description can be found at https://developer.atlassian.com/server/confluence/expansions-in-the-rest-api/
   */
  expand;
  /** Personal access token; when set it takes precedence and is sent as a Bearer token. */
  personalAccessToken;

  /**
   * @param params Loader configuration.
   * @param params.baseUrl Base URL of the Confluence instance.
   * @param params.spaceKey Key of the space to load.
   * @param params.username Username for Basic authentication.
   * @param params.accessToken Access token for Basic authentication.
   * @param params.limit Page size per request (default 25).
   * @param params.expand Value of the REST `expand` parameter (default "body.storage,version").
   * @param params.personalAccessToken Token for Bearer authentication.
   * @param params.maxRetries Maximum attempts per request (default 5).
   */
  constructor({ baseUrl, spaceKey, username, accessToken, limit = 25, expand = "body.storage,version", personalAccessToken, maxRetries = 5 }) {
    super();
    this.baseUrl = baseUrl;
    this.spaceKey = spaceKey;
    this.username = username;
    this.accessToken = accessToken;
    this.limit = limit;
    this.expand = expand;
    this.personalAccessToken = personalAccessToken;
    this.maxRetries = maxRetries;
  }

  /**
   * Returns the authorization header for the request.
   * A personal access token wins over username/access-token credentials.
   * @returns The authorization header as a string, or undefined if no credentials were provided.
   */
  get authorizationHeader() {
    if (this.personalAccessToken) return `Bearer ${this.personalAccessToken}`;
    if (this.username && this.accessToken) {
      const encoded = Buffer.from(`${this.username}:${this.accessToken}`).toString("base64");
      return `Basic ${encoded}`;
    }
    return undefined;
  }

  /**
   * Fetches all the pages in the specified space and converts each page to
   * a Document instance.
   *
   * This method is best-effort: any failure is logged with `console.error`
   * and an empty array is returned instead of rejecting.
   * @param options the extra options of the load function
   * @param options.limit The limit parameter to overwrite the size to fetch pages.
   * @param options.start The start parameter to set initial offset to fetch pages.
   * @returns Promise resolving to an array of Document instances.
   */
  async load(options) {
    try {
      const pages = await this.fetchAllPagesInSpace(options?.start, options?.limit);
      return pages.map((page) => this.createDocumentFromPage(page));
    } catch (error) {
      console.error("Error:", error);
      return [];
    }
  }

  /**
   * Fetches data from the Confluence API using the provided URL.
   * The request is attempted until it succeeds or `maxRetries` attempts have
   * failed (at least one attempt is always made).
   * @param url The URL to fetch data from.
   * @returns Promise resolving to the JSON response from the API.
   * @throws Error once the attempts are exhausted; the last underlying failure is attached as `cause`.
   */
  async fetchConfluenceData(url) {
    let retryCounter = 0;
    while (true) {
      retryCounter += 1;
      try {
        const initialHeaders = {
          "Content-Type": "application/json",
          Accept: "application/json"
        };
        const authHeader = this.authorizationHeader;
        if (authHeader) initialHeaders.Authorization = authHeader;
        const response = await fetch(url, { headers: initialHeaders });
        if (!response.ok) throw new Error(`Failed to fetch ${url} from Confluence: ${response.status}. Retrying...`);
        return await response.json();
      } catch (error) {
        if (retryCounter >= this.maxRetries) {
          // Keep the original error reachable for callers instead of only stringifying it.
          throw new Error(`Failed to fetch ${url} from Confluence (retry: ${retryCounter}): ${error}`, { cause: error });
        }
      }
    }
  }

  /**
   * Fetches all the pages in the specified space, following the API's
   * offset-based pagination until a page of results comes back empty.
   * @param start The start parameter to paginate through the results.
   * @param limit The page size to request; defaults to the loader's `limit`.
   * @returns Promise resolving to an array of ConfluencePage objects.
   */
  async fetchAllPagesInSpace(start = 0, limit = this.limit) {
    const pages = [];
    let offset = start;
    while (true) {
      const url = `${this.baseUrl}/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${offset}&expand=${this.expand}`;
      const data = await this.fetchConfluenceData(url);
      const results = data.results ?? [];
      // Fall back to the number of returned results when `size` is missing so
      // the offset never becomes NaN and pagination always terminates.
      const size = data.size ?? results.length;
      if (!(size > 0)) return pages;
      pages.push(...results);
      offset += size;
    }
  }

  /**
   * Creates a Document instance from a ConfluencePage object.
   *
   * Attachment macros are replaced by "[ATTACHMENT]", code macros are pulled
   * out before the HTML-to-text conversion and re-inserted afterwards as
   * fenced code blocks, and blank lines are removed from the final text.
   * A page without `body.storage.value` yields empty page content.
   * @param page The ConfluencePage object to convert.
   * @returns A Document instance.
   */
  createDocumentFromPage(page) {
    // `expand` is configurable, so the storage body is not guaranteed to be present.
    const storageHtml = page.body?.storage?.value ?? "";
    const htmlWithoutOtherMacros = storageHtml.replace(/<ac:structured-macro\s+ac:name="(attachments|view-file)"[^>]*(?:\/?>|>.*?<\/ac:structured-macro>)/gs, "[ATTACHMENT]");
    const codeBlocks = [];
    // Swap each code macro for a placeholder so htmlToText cannot reformat the code.
    const htmlWithPlaceholders = htmlWithoutOtherMacros.replace(/<ac:structured-macro.*?<ac:parameter ac:name="language">(.*?)<\/ac:parameter>.*?<ac:plain-text-body><!\[CDATA\[([\s\S]*?)\]\]><\/ac:plain-text-body><\/ac:structured-macro>/g, (_, language, code) => {
      const placeholder = `CODE_BLOCK_${codeBlocks.length}`;
      codeBlocks.push({
        language,
        code: code.trim()
      });
      return `\n${placeholder}\n`;
    });
    let plainTextContent = htmlToText(htmlWithPlaceholders, {
      wordwrap: false,
      preserveNewlines: true
    });
    codeBlocks.forEach(({ language, code }, index) => {
      const placeholder = `CODE_BLOCK_${index}`;
      const fencedBlock = `\`\`\`${language}\n${code}\n\`\`\``;
      // A replacer function inserts the text verbatim; a replacement *string*
      // would interpret "$$", "$&", "$1"… inside the code as special patterns.
      plainTextContent = plainTextContent.replace(placeholder, () => fencedBlock);
    });
    return new Document({
      pageContent: plainTextContent.replace(/^\s*[\r\n]/gm, ""),
      metadata: {
        id: page.id,
        status: page.status,
        title: page.title,
        type: page.type,
        url: `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`,
        version: page.version?.number,
        updated_by: page.version?.by?.displayName,
        updated_at: page.version?.when
      }
    });
  }
};
//#endregion
export { ConfluencePagesLoader, confluence_exports };
//# sourceMappingURL=confluence.js.map