llamaindex
Version:
<p align="center"> <img height="100" width="100" alt="LlamaIndex logo" src="https://ts.llamaindex.ai/square.svg" /> </p> <h1 align="center">LlamaIndex.TS</h1> <h3 align="center"> Data framework for your LLM application. </h3>
340 lines (335 loc) • 13.2 kB
JavaScript
Object.defineProperty(exports, '__esModule', { value: true });
var prompts = require('@llamaindex/core/prompts');
var schema = require('@llamaindex/core/schema');
var openai = require('@llamaindex/openai');
/**
 * Abstract base class for all metadata extractors.
 *
 * Subclasses must implement `extract(nodes)`, resolving to a list of metadata
 * objects that is index-aligned with `nodes`; `processNodes` merges entry `i`
 * into `nodes[i].metadata`.
 */
class BaseExtractor extends schema.TransformComponent {
    constructor() {
        // NOTE(review): the arrow function reads `this`, which is only legal once
        // `super()` has returned — this assumes TransformComponent stores the
        // callback and does not invoke it from inside its own constructor.
        super(async (nodes, options) => {
            return this.processNodes(nodes, options?.excludedEmbedMetadataKeys, options?.excludedLlmMetadataKeys);
        });
        // When true, subclasses skip nodes that are not `TextNode` instances.
        this.isTextNodeOnly = true;
        // Not read by any code in this file.
        this.showProgress = true;
        // Metadata mode passed to `node.getContent(...)` by some subclasses.
        this.metadataMode = schema.MetadataMode.ALL;
        // When false, every TextNode is re-created with the default text template.
        this.disableTemplateRewrite = false;
        // When true, `processNodes` works directly on the caller's array.
        this.inPlace = true;
        // Not read by any code in this file.
        this.numWorkers = 4;
    }

    /**
     * Runs `extract` on the nodes and merges the extracted metadata into them.
     *
     * @param nodes Nodes to extract metadata from.
     * @param excludedEmbedMetadataKeys Metadata keys to exclude from the embedding.
     * @param excludedLlmMetadataKeys Metadata keys to exclude from the LLM.
     * @returns The processed nodes (the same array as `nodes` when `inPlace` is true).
     */
    async processNodes(nodes, excludedEmbedMetadataKeys = undefined, excludedLlmMetadataKeys = undefined) {
        // `slice()` is a shallow copy: the array is new but the node objects are
        // still shared with the caller's array.
        const newNodes = this.inPlace ? nodes : nodes.slice();
        const curMetadataList = await this.extract(newNodes);

        // First pass: merge extracted metadata. Spreading `undefined` is a no-op,
        // so a shorter-than-expected metadata list is tolerated.
        for (let idx = 0; idx < newNodes.length; idx++) {
            newNodes[idx].metadata = {
                ...newNodes[idx].metadata,
                ...curMetadataList[idx]
            };
        }

        // Second pass: record excluded keys and optionally rewrite the template.
        for (let idx = 0; idx < newNodes.length; idx++) {
            const node = newNodes[idx];
            // `concat` returns a new array and leaves the receiver untouched, so
            // the result has to be assigned back for the exclusion to take effect.
            if (excludedEmbedMetadataKeys) {
                node.excludedEmbedMetadataKeys = (node.excludedEmbedMetadataKeys ?? []).concat(excludedEmbedMetadataKeys);
            }
            if (excludedLlmMetadataKeys) {
                node.excludedLlmMetadataKeys = (node.excludedLlmMetadataKeys ?? []).concat(excludedLlmMetadataKeys);
            }
            if (!this.disableTemplateRewrite && node instanceof schema.TextNode) {
                newNodes[idx] = new schema.TextNode({
                    ...node,
                    textTemplate: prompts.defaultNodeTextTemplate.format()
                });
            }
        }
        return newNodes;
    }
}
// Matches every line break (CRLF, LF or CR). Used with `replace(..., "")` to
// remove line breaks from LLM completion text.
const STRIP_REGEX = /(\r\n|\n|\r)/gm;
/**
 * Extract keywords from a list of nodes.
 */
class KeywordExtractor extends BaseExtractor {
    /**
     * Constructor for the KeywordExtractor class.
     * @param {object} [options]
     * @param {LLM} [options.llm] LLM instance; defaults to a new `openai.OpenAI()`.
     * @param {number} [options.keywords] Number of keywords to extract (default 5).
     * @param {string} [options.promptTemplate] Custom template using the
     *   `context` and `maxKeywords` variables.
     * @throws {Error} If keywords is less than 1.
     */
    constructor(options) {
        // Resolve the default first so that an explicit 0 is validated too
        // (a truthiness check would let 0 slip through).
        const keywords = options?.keywords ?? 5;
        if (keywords < 1) {
            throw new Error("Keywords must be greater than 0");
        }
        super();
        this.llm = options?.llm ?? new openai.OpenAI();
        /**
         * Number of keywords to extract.
         * @type {number}
         * @default 5
         */
        this.keywords = keywords;
        this.promptTemplate = options?.promptTemplate
            ? new prompts.PromptTemplate({
                templateVars: [
                    "context",
                    "maxKeywords"
                ],
                template: options.promptTemplate
            })
            : prompts.defaultKeywordExtractPrompt;
    }

    /**
     * Extract keywords from a single node.
     * @param node Node to extract keywords from.
     * @returns `{ excerptKeywords }` holding the raw completion text, or `{}`
     *   when the node is skipped because it is not a TextNode.
     */
    async extractKeywordsFromNodes(node) {
        if (this.isTextNodeOnly && !(node instanceof schema.TextNode)) {
            return {};
        }
        const prompt = this.promptTemplate.format({
            context: node.getContent(schema.MetadataMode.ALL),
            maxKeywords: this.keywords.toString()
        });
        const completion = await this.llm.complete({ prompt });
        return {
            excerptKeywords: completion.text
        };
    }

    /**
     * Extract keywords from every node; all nodes are processed concurrently.
     * @param nodes Nodes to extract keywords from.
     * @returns One result object per input node, in the same order.
     */
    async extract(nodes) {
        return Promise.all(nodes.map((node) => this.extractKeywordsFromNodes(node)));
    }
}
/**
 * Extract a document-level title for a list of nodes.
 *
 * Nodes are grouped by `sourceNode.nodeId`; each group gets one title, built by
 * asking the LLM for a candidate title per node and then asking it to combine
 * the candidates.
 */
class TitleExtractor extends BaseExtractor {
    /**
     * Constructor for the TitleExtractor class.
     * @param {object} [options]
     * @param {LLM} [options.llm] LLM instance; defaults to a new `openai.OpenAI()`.
     * @param {number} [options.nodes] Number of nodes to extract titles from (default 5).
     * @param {string} [options.nodeTemplate] Template for the per-node title prompt (uses `context`).
     * @param {string} [options.combineTemplate] Template for the prompt that merges candidate titles (uses `context`).
     */
    constructor(options) {
        super();
        /**
         * Can work for mixture of text and non-text nodes
         * @type {boolean}
         * @default false
         */
        this.isTextNodeOnly = false;
        this.llm = options?.llm ?? new openai.OpenAI();
        /**
         * Number of nodes to extract titles from.
         * NOTE(review): stored but not read by any method in this class —
         * confirm whether candidate generation should be capped by it.
         * @type {number}
         * @default 5
         */
        this.nodes = options?.nodes ?? 5;
        this.nodeTemplate = options?.nodeTemplate
            ? new prompts.PromptTemplate({
                templateVars: [
                    "context"
                ],
                template: options.nodeTemplate
            })
            : prompts.defaultTitleExtractorPromptTemplate;
        this.combineTemplate = options?.combineTemplate
            ? new prompts.PromptTemplate({
                templateVars: [
                    "context"
                ],
                template: options.combineTemplate
            })
            : prompts.defaultTitleCombinePromptTemplate;
    }

    /**
     * Extract titles from a list of nodes.
     *
     * The result is index-aligned with `nodes`: nodes removed by `filterNodes`
     * get `{}`, so a caller that merges results by index never attaches a title
     * to the wrong node.
     *
     * @param {BaseNode[]} nodes Nodes to extract titles from.
     * @returns {Promise<Array<ExtractTitle | {}>>} One entry per input node.
     *   `documentTitle` is `undefined` for an eligible node without a source node id.
     */
    async extract(nodes) {
        const nodesToExtractTitle = this.filterNodes(nodes);
        if (!nodesToExtractTitle.length) {
            return nodes.map(() => ({}));
        }
        const nodesByDocument = this.separateNodesByDocument(nodesToExtractTitle);
        const titlesByDocument = await this.extractTitles(nodesByDocument);
        const eligible = new Set(nodesToExtractTitle);
        return nodes.map((node) => {
            if (!eligible.has(node)) {
                return {};
            }
            return {
                documentTitle: titlesByDocument[node.sourceNode?.nodeId ?? ""]
            };
        });
    }

    /**
     * Keep only the nodes this extractor should handle.
     * @param {BaseNode[]} nodes Candidate nodes.
     * @returns {BaseNode[]} All nodes, or only TextNode instances when `isTextNodeOnly` is set.
     */
    filterNodes(nodes) {
        return nodes.filter((node) => !this.isTextNodeOnly || node instanceof schema.TextNode);
    }

    /**
     * Group nodes by the id of their source node; nodes without one are skipped.
     * @param {BaseNode[]} nodes Nodes to group.
     * @returns {Record<string, BaseNode[]>} Nodes keyed by `sourceNode.nodeId`.
     */
    separateNodesByDocument(nodes) {
        const nodesByDocument = {};
        for (const node of nodes) {
            const parentNode = node.sourceNode?.nodeId;
            if (!parentNode) {
                continue;
            }
            if (!nodesByDocument[parentNode]) {
                nodesByDocument[parentNode] = [];
            }
            nodesByDocument[parentNode].push(node);
        }
        return nodesByDocument;
    }

    /**
     * Produce one combined title per document group.
     * Groups are processed one after another; within a group the candidate
     * titles are requested concurrently (see `getTitlesCandidates`).
     * @param {Record<string, BaseNode[]>} nodesByDocument Nodes keyed by document id.
     * @returns {Promise<Record<string, string>>} Title text keyed by document id.
     */
    async extractTitles(nodesByDocument) {
        const titlesByDocument = {};
        for (const [key, nodes] of Object.entries(nodesByDocument)) {
            const titleCandidates = await this.getTitlesCandidates(nodes);
            const combinedTitles = titleCandidates.join(", ");
            const completion = await this.llm.complete({
                prompt: this.combineTemplate.format({
                    context: combinedTitles
                })
            });
            titlesByDocument[key] = completion.text;
        }
        return titlesByDocument;
    }

    /**
     * Ask the LLM for a candidate title for each node, concurrently.
     * @param {BaseNode[]} nodes Nodes of a single document.
     * @returns {Promise<string[]>} Candidate titles, in node order.
     */
    async getTitlesCandidates(nodes) {
        const titleJobs = nodes.map(async (node) => {
            const completion = await this.llm.complete({
                prompt: this.nodeTemplate.format({
                    context: node.getContent(schema.MetadataMode.ALL)
                })
            });
            return completion.text;
        });
        return Promise.all(titleJobs);
    }
}
/**
 * Extract the questions that each node's content can answer.
 */
class QuestionsAnsweredExtractor extends BaseExtractor {
    /**
     * Constructor for the QuestionsAnsweredExtractor class.
     * @param {object} [options]
     * @param {LLM} [options.llm] LLM instance; defaults to a new `openai.OpenAI()`.
     * @param {number} [options.questions] Number of questions to generate (default 5).
     * @param {string} [options.promptTemplate] Custom template using the
     *   `numQuestions` and `context` variables.
     * @param {boolean} [options.embeddingOnly] Whether to use metadata for embeddings only.
     * @throws {Error} If questions is less than 1.
     */
    constructor(options) {
        // Resolve the default first so that an explicit 0 is validated too
        // (a truthiness check would let 0 slip through).
        const questions = options?.questions ?? 5;
        if (questions < 1) {
            throw new Error("Questions must be greater than 0");
        }
        super();
        this.llm = options?.llm ?? new openai.OpenAI();
        /**
         * Number of questions to generate.
         * @type {number}
         * @default 5
         */
        this.questions = questions;
        // Pre-fill the custom template with the configured count rather than a
        // hard-coded "5", so the partial value agrees with `this.questions`.
        this.promptTemplate = options?.promptTemplate
            ? new prompts.PromptTemplate({
                templateVars: [
                    "numQuestions",
                    "context"
                ],
                template: options.promptTemplate
            }).partialFormat({
                numQuestions: questions.toString()
            })
            : prompts.defaultQuestionExtractPrompt;
        /**
         * Whether to use metadata for embeddings only.
         * NOTE(review): stored but not read by any method in this class.
         * @type {boolean}
         * @default false
         */
        this.embeddingOnly = options?.embeddingOnly ?? false;
    }

    /**
     * Extract answered questions from a node.
     * @param {BaseNode} node Node to extract questions from.
     * @returns {Promise<ExtractQuestion | {}>} `{ questionsThisExcerptCanAnswer }`
     *   with line breaks removed from the completion, or `{}` when the node is
     *   skipped because it is not a TextNode.
     */
    async extractQuestionsFromNode(node) {
        if (this.isTextNodeOnly && !(node instanceof schema.TextNode)) {
            return {};
        }
        const prompt = this.promptTemplate.format({
            context: node.getContent(this.metadataMode),
            numQuestions: this.questions.toString()
        });
        const completion = await this.llm.complete({ prompt });
        return {
            questionsThisExcerptCanAnswer: completion.text.replace(STRIP_REGEX, "")
        };
    }

    /**
     * Extract answered questions from a list of nodes, concurrently.
     * @param {BaseNode[]} nodes Nodes to extract questions from.
     * @returns {Promise<Array<ExtractQuestion | {}>>} One result per input node, in order.
     */
    async extract(nodes) {
        return Promise.all(nodes.map((node) => this.extractQuestionsFromNode(node)));
    }
}
/**
 * Extract section summaries from a list of nodes.
 *
 * Depending on the configured `summaries`, each node receives its own summary
 * ("self") and/or the summary of the previous ("prev") or next ("next") node
 * in the list.
 */
class SummaryExtractor extends BaseExtractor {
    /**
     * Constructor for the SummaryExtractor class.
     * @param {object} [options]
     * @param {LLM} [options.llm] LLM instance; defaults to a new `openai.OpenAI()`.
     * @param {string[]} [options.summaries] Which summaries to attach; any
     *   non-empty combination of "self", "prev" and "next" (default `["self"]`).
     * @param {string} [options.promptTemplate] Custom template using the `context` variable.
     * @throws {Error} If `summaries` is empty or contains any other value.
     */
    constructor(options) {
        const summaries = options?.summaries ?? [
            "self"
        ];
        const allowed = [
            "self",
            "prev",
            "next"
        ];
        // Every entry must be valid; checking with `some` would accept a list
        // such as ["self", "bogus"]. An empty list is rejected as well.
        if (summaries.length === 0 || !summaries.every((s) => allowed.includes(s))) {
            throw new Error("Summaries must be one of 'self', 'prev', 'next'");
        }
        super();
        this.llm = options?.llm ?? new openai.OpenAI();
        this.summaries = summaries;
        this.promptTemplate = options?.promptTemplate
            ? new prompts.PromptTemplate({
                templateVars: [
                    "context"
                ],
                template: options.promptTemplate
            })
            : prompts.defaultSummaryPrompt;
        this.selfSummary = summaries.includes("self");
        this.prevSummary = summaries.includes("prev");
        this.nextSummary = summaries.includes("next");
    }

    /**
     * Extract summary from a node.
     * @param {BaseNode} node Node to extract summary from.
     * @returns {Promise<string>} The completion text with line breaks removed,
     *   or "" when the node is skipped because it is not a TextNode.
     */
    async generateNodeSummary(node) {
        if (this.isTextNodeOnly && !(node instanceof schema.TextNode)) {
            return "";
        }
        const prompt = this.promptTemplate.format({
            context: node.getContent(this.metadataMode)
        });
        const completion = await this.llm.complete({ prompt });
        return completion.text.replace(STRIP_REGEX, "");
    }

    /**
     * Extract summaries from a list of nodes.
     * @param {BaseNode[]} nodes Nodes to extract summaries from.
     * @returns {Promise<Array<ExtractSummary | {}>>} One metadata object per
     *   node, in order; empty summaries are omitted.
     * @throws {Error} If any node is not a `TextNode`.
     */
    async extract(nodes) {
        if (!nodes.every((n) => n instanceof schema.TextNode)) {
            throw new Error("Only `TextNode` is allowed for `Summary` extractor");
        }
        const nodeSummaries = await Promise.all(nodes.map((node) => this.generateNodeSummary(node)));
        return nodes.map((_, i) => {
            const metadata = {};
            // Out-of-range neighbours read as `undefined`, which is falsy, so the
            // first and last nodes simply get no prev/next entry.
            if (this.prevSummary && nodeSummaries[i - 1]) {
                metadata["prevSectionSummary"] = nodeSummaries[i - 1];
            }
            if (this.nextSummary && nodeSummaries[i + 1]) {
                metadata["nextSectionSummary"] = nodeSummaries[i + 1];
            }
            if (this.selfSummary && nodeSummaries[i]) {
                metadata["sectionSummary"] = nodeSummaries[i];
            }
            return metadata;
        });
    }
}
// Public CommonJS exports: the abstract base class and the four concrete extractors.
exports.BaseExtractor = BaseExtractor;
exports.KeywordExtractor = KeywordExtractor;
exports.QuestionsAnsweredExtractor = QuestionsAnsweredExtractor;
exports.SummaryExtractor = SummaryExtractor;
exports.TitleExtractor = TitleExtractor;