UNPKG

@langchain/community

Version:
1 lines 10.4 kB
{"version":3,"file":"confluence.cjs","names":["BaseDocumentLoader","Document"],"sources":["../../../src/document_loaders/web/confluence.ts"],"sourcesContent":["import { htmlToText } from \"html-to-text\";\nimport { Document } from \"@langchain/core/documents\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\n\n/**\n * Interface representing the parameters for configuring the\n * ConfluencePagesLoader.\n */\nexport interface ConfluencePagesLoaderParams {\n baseUrl: string;\n spaceKey: string;\n username?: string;\n accessToken?: string;\n personalAccessToken?: string;\n limit?: number;\n expand?: string;\n maxRetries?: number;\n}\n\n/**\n * Interface representing a Confluence page.\n */\nexport interface ConfluencePage {\n id: string;\n title: string;\n type: string;\n body: {\n storage: {\n value: string;\n };\n };\n status: string;\n version?: {\n number: number;\n when: string;\n by: {\n displayName: string;\n };\n };\n}\n\n/**\n * Interface representing the response from the Confluence API.\n */\nexport interface ConfluenceAPIResponse {\n size: number;\n results: ConfluencePage[];\n}\n\n/**\n * Class representing a document loader for loading pages from Confluence.\n * @example\n * ```typescript\n * const loader = new ConfluencePagesLoader({\n * baseUrl: \"https:\n * spaceKey: \"~EXAMPLE362906de5d343d49dcdbae5dEXAMPLE\",\n * username: \"your-username\",\n * accessToken: \"your-access-token\",\n * });\n * const documents = await loader.load();\n * console.log(documents);\n * ```\n */\nexport class ConfluencePagesLoader extends BaseDocumentLoader {\n public readonly baseUrl: string;\n\n public readonly spaceKey: string;\n\n public readonly username?: string;\n\n public readonly accessToken?: string;\n\n public readonly limit: number;\n\n public readonly maxRetries: number;\n\n /**\n * expand parameter for confluence rest api\n * description can be found at https://developer.atlassian.com/server/confluence/expansions-in-the-rest-api/\n */\n public readonly expand?: string;\n\n public readonly personalAccessToken?: string;\n\n constructor({\n baseUrl,\n spaceKey,\n username,\n accessToken,\n limit = 25,\n expand = \"body.storage,version\",\n personalAccessToken,\n maxRetries = 5,\n }: ConfluencePagesLoaderParams) {\n super();\n this.baseUrl = baseUrl;\n this.spaceKey = spaceKey;\n this.username = username;\n this.accessToken = accessToken;\n this.limit = limit;\n this.expand = expand;\n this.personalAccessToken = personalAccessToken;\n this.maxRetries = maxRetries;\n }\n\n /**\n * Returns the authorization header for the request.\n * @returns The authorization header as a string, or undefined if no credentials were provided.\n */\n private get authorizationHeader(): string | undefined {\n if (this.personalAccessToken) {\n return `Bearer ${this.personalAccessToken}`;\n } else if (this.username && this.accessToken) {\n const authToken = Buffer.from(\n `${this.username}:${this.accessToken}`\n ).toString(\"base64\");\n return `Basic ${authToken}`;\n }\n\n return undefined;\n }\n\n /**\n * Fetches all the pages in the specified space and converts each page to\n * a Document instance.\n * @param options the extra options of the load function\n * @param options.limit The limit parameter to overwrite the size to fetch pages.\n * @param options.start The start parameter to set inital offset to fetch pages.\n * @returns Promise resolving to an array of Document instances.\n */\n public async load(options?: {\n start?: number;\n limit?: number;\n }): Promise<Document[]> {\n try {\n const pages = await this.fetchAllPagesInSpace(\n options?.start,\n options?.limit\n );\n return pages.map((page) => this.createDocumentFromPage(page));\n } catch (error) {\n console.error(\"Error:\", error);\n return [];\n }\n }\n\n /**\n * Fetches data from the Confluence API using the provided URL.\n * @param url The URL to fetch data from.\n * @returns Promise resolving to the JSON response from the API.\n */\n protected async fetchConfluenceData(\n url: string\n ): Promise<ConfluenceAPIResponse> {\n let retryCounter = 0;\n while (true) {\n retryCounter += 1;\n try {\n const initialHeaders: HeadersInit = {\n \"Content-Type\": \"application/json\",\n Accept: \"application/json\",\n };\n\n const authHeader = this.authorizationHeader;\n if (authHeader) {\n initialHeaders.Authorization = authHeader;\n }\n\n const response = await fetch(url, {\n headers: initialHeaders,\n });\n\n if (!response.ok) {\n throw new Error(\n `Failed to fetch ${url} from Confluence: ${response.status}. Retrying...`\n );\n }\n\n return await response.json();\n } catch (error) {\n if (retryCounter >= this.maxRetries)\n throw new Error(\n `Failed to fetch ${url} from Confluence (retry: ${retryCounter}): ${error}`\n );\n }\n }\n }\n\n /**\n * Recursively fetches all the pages in the specified space.\n * @param start The start parameter to paginate through the results.\n * @returns Promise resolving to an array of ConfluencePage objects.\n */\n private async fetchAllPagesInSpace(\n start = 0,\n limit = this.limit\n ): Promise<ConfluencePage[]> {\n const url = `${this.baseUrl}/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`;\n const data = await this.fetchConfluenceData(url);\n\n if (data.size === 0) {\n return [];\n }\n\n const nextPageStart = start + data.size;\n const nextPageResults = await this.fetchAllPagesInSpace(\n nextPageStart,\n limit\n );\n\n return data.results.concat(nextPageResults);\n }\n\n /**\n * Creates a Document instance from a ConfluencePage object.\n * @param page The ConfluencePage object to convert.\n * @returns A Document instance.\n */\n private createDocumentFromPage(page: ConfluencePage): Document {\n const htmlContent = page.body.storage.value;\n\n // Handle both self-closing and regular macros for attachments and view-file\n const htmlWithoutOtherMacros = htmlContent.replace(\n /<ac:structured-macro\\s+ac:name=\"(attachments|view-file)\"[^>]*(?:\\/?>|>.*?<\\/ac:structured-macro>)/gs,\n \"[ATTACHMENT]\"\n );\n\n // Extract and preserve code blocks with unique placeholders\n const codeBlocks: { language: string; code: string }[] = [];\n const htmlWithPlaceholders = htmlWithoutOtherMacros.replace(\n /<ac:structured-macro.*?<ac:parameter ac:name=\"language\">(.*?)<\\/ac:parameter>.*?<ac:plain-text-body><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/ac:plain-text-body><\\/ac:structured-macro>/g,\n (_, language, code) => {\n const placeholder = `CODE_BLOCK_${codeBlocks.length}`;\n codeBlocks.push({ language, code: code.trim() });\n return `\\n${placeholder}\\n`;\n }\n );\n\n // Convert the HTML content to plain text\n let plainTextContent = htmlToText(htmlWithPlaceholders, {\n wordwrap: false,\n preserveNewlines: true,\n });\n\n // Reinsert code blocks with proper markdown formatting\n codeBlocks.forEach(({ language, code }, index) => {\n const placeholder = `CODE_BLOCK_${index}`;\n plainTextContent = plainTextContent.replace(\n placeholder,\n `\\`\\`\\`${language}\\n${code}\\n\\`\\`\\``\n );\n });\n\n // Remove empty lines\n const textWithoutEmptyLines = plainTextContent.replace(/^\\s*[\\r\\n]/gm, \"\");\n\n // Rest of the method remains the same...\n return new Document({\n pageContent: textWithoutEmptyLines,\n metadata: {\n id: page.id,\n status: page.status,\n title: page.title,\n type: page.type,\n url: `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`,\n version: page.version?.number,\n updated_by: page.version?.by?.displayName,\n updated_at: page.version?.when,\n },\n });\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;AA+DA,IAAa,wBAAb,cAA2CA,sCAAAA,mBAAmB;CAC5D;CAEA;CAEA;CAEA;CAEA;CAEA;;;;;CAMA;CAEA;CAEA,YAAY,EACV,SACA,UACA,UACA,aACA,QAAQ,IACR,SAAS,wBACT,qBACA,aAAa,KACiB;AAC9B,SAAO;AACP,OAAK,UAAU;AACf,OAAK,WAAW;AAChB,OAAK,WAAW;AAChB,OAAK,cAAc;AACnB,OAAK,QAAQ;AACb,OAAK,SAAS;AACd,OAAK,sBAAsB;AAC3B,OAAK,aAAa;;;;;;CAOpB,IAAY,sBAA0C;AACpD,MAAI,KAAK,oBACP,QAAO,UAAU,KAAK;WACb,KAAK,YAAY,KAAK,YAI/B,QAAO,SAHW,OAAO,KACvB,GAAG,KAAK,SAAS,GAAG,KAAK,cAC1B,CAAC,SAAS,SAAS;;;;;;;;;;CAexB,MAAa,KAAK,SAGM;AACtB,MAAI;AAKF,WAJc,MAAM,KAAK,qBACvB,SAAS,OACT,SAAS,MACV,EACY,KAAK,SAAS,KAAK,uBAAuB,KAAK,CAAC;WACtD,OAAO;AACd,WAAQ,MAAM,UAAU,MAAM;AAC9B,UAAO,EAAE;;;;;;;;CASb,MAAgB,oBACd,KACgC;EAChC,IAAI,eAAe;AACnB,SAAO,MAAM;AACX,mBAAgB;AAChB,OAAI;IACF,MAAM,iBAA8B;KAClC,gBAAgB;KAChB,QAAQ;KACT;IAED,MAAM,aAAa,KAAK;AACxB,QAAI,WACF,gBAAe,gBAAgB;IAGjC,MAAM,WAAW,MAAM,MAAM,KAAK,EAChC,SAAS,gBACV,CAAC;AAEF,QAAI,CAAC,SAAS,GACZ,OAAM,IAAI,MACR,mBAAmB,IAAI,oBAAoB,SAAS,OAAO,eAC5D;AAGH,WAAO,MAAM,SAAS,MAAM;YACrB,OAAO;AACd,QAAI,gBAAgB,KAAK,WACvB,OAAM,IAAI,MACR,mBAAmB,IAAI,2BAA2B,aAAa,KAAK,QACrE;;;;;;;;;CAUT,MAAc,qBACZ,QAAQ,GACR,QAAQ,KAAK,OACc;EAC3B,MAAM,MAAM,GAAG,KAAK,QAAQ,6BAA6B,KAAK,SAAS,SAAS,MAAM,SAAS,MAAM,UAAU,KAAK;EACpH,MAAM,OAAO,MAAM,KAAK,oBAAoB,IAAI;AAEhD,MAAI,KAAK,SAAS,EAChB,QAAO,EAAE;EAGX,MAAM,gBAAgB,QAAQ,KAAK;EACnC,MAAM,kBAAkB,MAAM,KAAK,qBACjC,eACA,MACD;AAED,SAAO,KAAK,QAAQ,OAAO,gBAAgB;;;;;;;CAQ7C,uBAA+B,MAAgC;EAI7D,MAAM,yBAHc,KAAK,KAAK,QAAQ,MAGK,QACzC,uGACA,eACD;EAGD,MAAM,aAAmD,EAAE;EAW3D,IAAI,oBAAA,GAAA,aAAA,YAVyB,uBAAuB,QAClD,iLACC,GAAG,UAAU,SAAS;GACrB,MAAM,cAAc,cAAc,WAAW;AAC7C,cAAW,KAAK;IAAE;IAAU,MAAM,KAAK,MAAM;IAAE,CAAC;AAChD,UAAO,KAAK,YAAY;IAE3B,EAGuD;GACtD,UAAU;GACV,kBAAkB;GACnB,CAAC;AAGF,aAAW,SAAS,EAAE,UAAU,QAAQ,UAAU;GAChD,MAAM,cAAc,cAAc;AAClC,sBAAmB,iBAAiB,QAClC,aACA,SAAS,SAAS,IAAI,KAAK,UAC5B;IACD;AAMF,SAAO,IAAIC,0BAAAA,SAAS;GAClB,aAJ4B,iBAAiB,QAAQ,gBAAgB,GAAG;GAKxE,UAAU;IACR,IAAI,KAAK;IACT,QAAQ,KAAK;IACb,OAAO,KAAK;IACZ,MAAM,KAAK;IACX,KAAK,GAAG,KAAK,QAAQ,UAAU,KAAK,SAAS,SAAS,KAAK;IAC3D,SAAS,KAAK,SAAS;IACvB,YAAY,KAAK,SAAS,IAAI;IAC9B,YAAY,KAAK,SAAS;IAC3B;GACF,CAAC"}