@langchain/community
Version:
Third-party integrations for LangChain.js
1 lines • 5.17 kB
Source Map (JSON)
{"version":3,"file":"BM25.cjs","names":[],"sources":["../../../../src/utils/@furkantoprak/bm25/BM25.ts"],"sourcesContent":["/**\n * Adapted from\n * https://github.com/FurkanToprak/OkapiBM25\n *\n * Inlined due to CJS import issues.\n */\n\n/** Gets word count. */\nexport const getWordCount = (corpus: string) => {\n return ((corpus || \"\").match(/\\w+/g) || []).length;\n};\n\n/** Number of occurences of a word in a string. */\nexport const getTermFrequency = (term: string, corpus: string) => {\n // Escape any RegExp metacharacters in the term so constructing a RegExp\n // from user-provided or model-generated queries does not throw an error\n const escaped = (term || \"\").replace(/[.*+?^${}()|[\\]\\\\]/g, \"\\\\$&\");\n return ((corpus || \"\").match(new RegExp(escaped, \"g\")) || []).length;\n};\n\n/** Inverse document frequency. */\nexport const getIDF = <T>(term: string, documents: BMInputDocument<T>[]) => {\n // Number of relevant documents.\n const relevantDocuments = documents.filter((document) =>\n document.text.includes(term)\n ).length;\n return Math.log(\n (documents.length - relevantDocuments + 0.5) / (relevantDocuments + 0.5) + 1\n );\n};\n\nexport interface BMInputDocument<T> {\n /** The text from the original document */\n text: string;\n /** The original document */\n document: T;\n}\n\n/** Represents a document; useful when sorting results.\n */\nexport interface BMOutputDocument<T> {\n /** The original document */\n document: T;\n /** The score that the document receives. */\n score: number;\n}\n\n/** Constants that are free parameters used in BM25, specifically when generating inverse document frequency. */\nexport interface BMConstants {\n /** Free parameter. Is 0.75 by default. */\n b?: number;\n /** Free parameter. Is 1.2 by default. Generally in range [1.2, 2.0] */\n k1?: number;\n}\n\n/** If returns positive, the sorting results in secondEl coming before firstEl, else, firstEl comes before secondEL */\nexport type BMSorter<T> = (\n firstEl: BMOutputDocument<T>,\n secondEl: BMOutputDocument<T>\n) => number;\n\n/** Implementation of Okapi BM25 algorithm.\n * @param documents: Collection of documents with text content and associated data.\n * @param keywords: query terms.\n * @param constants: Contains free parameters k1 and b. b=0.75 and k1=1.2 by default.\n * @param sorter: A function that allows you to sort results by a given rule. If not provided, returns results in the original document order.\n */\nexport function BM25<T>(\n documents: BMInputDocument<T>[],\n keywords: string[],\n constants?: BMConstants,\n sorter?: BMSorter<T>\n): BMOutputDocument<T>[] {\n const b = constants && constants.b ? constants.b : 0.75;\n const k1 = constants && constants.k1 ? constants.k1 : 1.2;\n const documentLengths = documents.map((document) =>\n getWordCount(document.text)\n );\n const averageDocumentLength =\n documentLengths.reduce((a, b) => a + b, 0) / documents.length;\n const idfByKeyword = keywords.reduce((obj, keyword) => {\n obj.set(keyword, getIDF(keyword, documents));\n return obj;\n }, new Map<string, number>());\n\n const scoredDocs = documents.map(({ text, document }, index) => {\n const score = keywords\n .map((keyword: string) => {\n const inverseDocumentFrequency = idfByKeyword.get(keyword);\n if (inverseDocumentFrequency === undefined) {\n throw new Error(\"Missing keyword.\");\n }\n const termFrequency = getTermFrequency(keyword, text);\n const documentLength = documentLengths[index];\n return (\n (inverseDocumentFrequency * (termFrequency * (k1 + 1))) /\n (termFrequency +\n k1 * (1 - b + (b * documentLength) / averageDocumentLength))\n );\n })\n .reduce((a: number, b: number) => a + b, 0);\n return { score, document } as BMOutputDocument<T>;\n });\n // sort the results\n if (sorter) {\n return scoredDocs.sort(sorter);\n }\n return scoredDocs;\n}\n"],"mappings":";;;;;;;;AAQA,MAAa,gBAAgB,WAAmB;AAC9C,UAAS,UAAU,IAAI,MAAM,OAAO,IAAI,EAAE,EAAE;;;AAI9C,MAAa,oBAAoB,MAAc,WAAmB;CAGhE,MAAM,WAAW,QAAQ,IAAI,QAAQ,uBAAuB,OAAO;AACnE,UAAS,UAAU,IAAI,MAAM,IAAI,OAAO,SAAS,IAAI,CAAC,IAAI,EAAE,EAAE;;;AAIhE,MAAa,UAAa,MAAc,cAAoC;CAE1E,MAAM,oBAAoB,UAAU,QAAQ,aAC1C,SAAS,KAAK,SAAS,KAAK,CAC7B,CAAC;AACF,QAAO,KAAK,KACT,UAAU,SAAS,oBAAoB,OAAQ,oBAAoB,MAAO,EAC5E;;;;;;;;AAuCH,SAAgB,KACd,WACA,UACA,WACA,QACuB;CACvB,MAAM,IAAI,aAAa,UAAU,IAAI,UAAU,IAAI;CACnD,MAAM,KAAK,aAAa,UAAU,KAAK,UAAU,KAAK;CACtD,MAAM,kBAAkB,UAAU,KAAK,aACrC,aAAa,SAAS,KAAK,CAC5B;CACD,MAAM,wBACJ,gBAAgB,QAAQ,GAAG,MAAM,IAAI,GAAG,EAAE,GAAG,UAAU;CACzD,MAAM,eAAe,SAAS,QAAQ,KAAK,YAAY;AACrD,MAAI,IAAI,SAAS,OAAO,SAAS,UAAU,CAAC;AAC5C,SAAO;oBACN,IAAI,KAAqB,CAAC;CAE7B,MAAM,aAAa,UAAU,KAAK,EAAE,MAAM,YAAY,UAAU;AAgB9D,SAAO;GAAE,OAfK,SACX,KAAK,YAAoB;IACxB,MAAM,2BAA2B,aAAa,IAAI,QAAQ;AAC1D,QAAI,6BAA6B,KAAA,EAC/B,OAAM,IAAI,MAAM,mBAAmB;IAErC,MAAM,gBAAgB,iBAAiB,SAAS,KAAK;IACrD,MAAM,iBAAiB,gBAAgB;AACvC,WACG,4BAA4B,iBAAiB,KAAK,OAClD,gBACC,MAAM,IAAI,IAAK,IAAI,iBAAkB;KAEzC,CACD,QAAQ,GAAW,MAAc,IAAI,GAAG,EAAE;GAC7B;GAAU;GAC1B;AAEF,KAAI,OACF,QAAO,WAAW,KAAK,OAAO;AAEhC,QAAO"}