llamaindex
Version:
<p align="center"> <img height="100" width="100" alt="LlamaIndex logo" src="https://ts.llamaindex.ai/square.svg" /> </p> <h1 align="center">LlamaIndex.TS</h1> <h3 align="center"> Data framework for your LLM application. </h3>
584 lines (583 loc) • 16.1 kB
JavaScript
// generated by "tsup ./src/index.js --format esm"
// Local alias for Object.getOwnPropertyNames; used below to look up the
// first own key of a module-wrapper object.
var __getOwnPropNames = Object.getOwnPropertyNames;
// CommonJS interop shim.
// `cb` is an object whose first own property is a factory taking
// `(exports, module)`. The returned `__require` function runs that factory
// only while `mod` is still falsy, storing a fresh `{ exports: {} }` in `mod`
// first, and it always evaluates to `mod.exports` (the trailing comma
// expression). Later calls therefore reuse the cached module object instead
// of re-running the factory.
var __commonJS = (cb, mod)=>function __require() {
// `(0, fn)(...)` calls the factory without binding `this` to `cb`.
return mod || (0, cb[__getOwnPropNames(cb)[0]])((mod = {
exports: {}
}).exports, mod), mod.exports;
};
// src/stopwords.js
// Lazily-evaluated wrapper around the bundled stop-word module. Calling
// require_stopwords() yields `{ stopwords: string[] }`; the array holds the
// lower-case terms that buildStopWordRegex() turns into phrase boundaries.
var require_stopwords = __commonJS({
"src/stopwords.js" (exports, module) {
"use strict";
module.exports = {
stopwords: [
"a",
"about",
"above",
"across",
"after",
"again",
"against",
"all",
"almost",
"alone",
"along",
"already",
"also",
"although",
"always",
"among",
"an",
"and",
"another",
"any",
"anybody",
"anyone",
"anything",
"anywhere",
"are",
"area",
"areas",
"around",
"as",
"ask",
"asked",
"asking",
"asks",
"at",
"away",
"b",
"back",
"backed",
"backing",
"backs",
"be",
"because",
"become",
"becomes",
"became",
"been",
"before",
"began",
"behind",
"being",
"beings",
"best",
"better",
"between",
"big",
"both",
"but",
"by",
"c",
"came",
"can",
"cannot",
"case",
"cases",
"certain",
"certainly",
"clear",
"clearly",
"come",
"contains",
"could",
"d",
"did",
"differ",
"different",
"differently",
"do",
"does",
"done",
"down",
"downed",
"downing",
"downs",
"during",
"e",
"each",
"early",
"either",
"end",
"ended",
"ending",
"ends",
"enough",
"even",
"evenly",
"ever",
"every",
"everybody",
"everyone",
"everything",
"everywhere",
"f",
"face",
"faces",
"fact",
"facts",
"far",
"felt",
"few",
"find",
"finds",
"first",
"for",
"four",
"from",
"full",
"fully",
"further",
"furthered",
"furthering",
"furthers",
"g",
"gave",
"general",
"generally",
"get",
"gets",
"give",
"given",
"gives",
"go",
"going",
"good",
"goods",
"got",
"great",
"greater",
"greatest",
"group",
"grouped",
"grouping",
"groups",
"h",
"had",
"has",
"have",
"having",
"he",
"her",
"herself",
"here",
"high",
"higher",
"highest",
"him",
"himself",
"his",
"how",
"however",
"i",
"if",
"important",
"in",
"interest",
"interested",
"interesting",
"interests",
"into",
"is",
"it",
"its",
"itself",
"j",
"just",
"k",
"keep",
"keeps",
"kind",
"knew",
"know",
"known",
"knows",
"l",
"large",
"largely",
"last",
"later",
"latest",
"least",
"less",
"let",
"lets",
"like",
"likely",
"long",
"longer",
"longest",
"m",
"made",
"make",
"making",
"man",
"many",
"may",
"me",
"member",
"members",
"men",
"might",
"more",
"most",
"mostly",
"mr",
"mrs",
"much",
"must",
"my",
"myself",
"n",
"necessary",
"need",
"needed",
"needing",
"needs",
"never",
"new",
"newer",
"newest",
"next",
"no",
"non",
"not",
"nobody",
"noone",
"nothing",
"now",
"nowhere",
"number",
"numbers",
"o",
"of",
"off",
"often",
"old",
"older",
"oldest",
"on",
"once",
"one",
"only",
"open",
"opened",
"opening",
"opens",
"or",
"order",
"ordered",
"ordering",
"orders",
"other",
"others",
"our",
"out",
"over",
"p",
"part",
"parted",
"parting",
"parts",
"per",
"perhaps",
"place",
"places",
"point",
"pointed",
"pointing",
"points",
"possible",
"present",
"presented",
"presenting",
"presents",
"problem",
"problems",
"put",
"puts",
"q",
"quite",
"r",
"rather",
"really",
"right",
"room",
"rooms",
"s",
"said",
"same",
"saw",
"say",
"says",
"second",
"seconds",
"see",
"sees",
"seem",
"seemed",
"seeming",
"seems",
"several",
"shall",
"she",
"should",
"show",
"showed",
"showing",
"shows",
"side",
"sides",
"since",
"small",
"smaller",
"smallest",
"so",
"some",
"somebody",
"someone",
"something",
"somewhere",
"state",
"states",
"still",
"such",
"sure",
"t",
"take",
"taken",
"than",
"that",
"the",
"their",
"them",
"then",
"there",
"therefore",
"these",
"they",
"thing",
"things",
"think",
"thinks",
"this",
"those",
"though",
"thought",
"thoughts",
"three",
"through",
"thus",
"to",
"today",
"together",
"too",
"took",
"toward",
"turn",
"turned",
"turning",
"turns",
"two",
"u",
"under",
"until",
"up",
"upon",
"us",
"use",
"uses",
"used",
"v",
"very",
"w",
"want",
"wanted",
"wanting",
"wants",
"was",
"way",
"ways",
"we",
"well",
"wells",
"went",
"were",
"what",
"when",
"where",
"whether",
"which",
"while",
"who",
"whole",
"whose",
"why",
"will",
"with",
"within",
"without",
"work",
"worked",
"working",
"works",
"would",
"y",
"year",
"years",
"yet",
"you",
"young",
"younger",
"youngest",
"your",
"yours",
// NOTE(review): the remaining entries look like domain-specific additions
// rather than generic English stop words — confirm they are still wanted.
"eoc",
"mu",
"sigma",
"mu sigma",
"musigma",
"client",
"clients",
"capabilities",
"capability",
"firm",
"firms",
"biggest",
// "-" contains no \w character, so buildStopWordRegex() skips it.
"-"
]
};
}
});
// src/index.js
import _ from "lodash";
// lodash helpers used by rake() to order the scored keywords.
const { fromPairs, sortBy, toPairs } = _;
// Evaluate the bundled stop-word module; `stopwords.stopwords` is the word array.
var stopwords = require_stopwords();
/**
 * Reports whether the given value, converted to a string, contains at least
 * one decimal digit. Despite the name it does not require the whole string to
 * be numeric: "a1" is reported as a number.
 *
 * @param {string} str - Text to inspect.
 * @returns {boolean} `true` when any digit 0-9 occurs in `str`.
 */
function isNumber(str) {
  const firstDigit = /\d/.exec(str);
  return firstDigit !== null;
}
/**
 * Decides whether a candidate phrase is worth keeping as a keyword.
 *
 * A phrase is rejected when it is shorter than `minCharLength` characters,
 * has more than `maxWordsLength` space-separated words, contains no ASCII
 * letter at all, or contains more digits than ASCII letters.
 *
 * The length check compares `phrase.length`; comparing the string itself with
 * a number coerces it to NaN, which made the minimum-length filter a no-op.
 *
 * @param {string} phrase - Candidate phrase (already trimmed by the caller).
 * @param {number} minCharLength - Minimum number of characters required.
 * @param {number} maxWordsLength - Maximum number of space-separated words.
 * @returns {boolean} `true` when the phrase passes every check.
 */
function isAcceptable(phrase, minCharLength, maxWordsLength) {
  if (phrase.length < minCharLength) {
    return false;
  }
  if (phrase.split(" ").length > maxWordsLength) {
    return false;
  }
  // Count digits and ASCII letters; everything else is ignored.
  const digits = (phrase.match(/\d/g) || []).length;
  const alpha = (phrase.match(/[a-zA-Z]/g) || []).length;
  if (alpha === 0) {
    return false;
  }
  return digits <= alpha;
}
/**
 * Counts how many elements of `haystack` are strictly equal to `needle`.
 *
 * @param {Array} haystack - Values to search through.
 * @param {*} needle - Value to look for (compared with `===`).
 * @returns {number} Number of matching elements; 0 for an empty array.
 */
function countOccurances(haystack, needle) {
  const matches = haystack.filter((candidate) => candidate === needle);
  return matches.length;
}
/**
 * Scores every distinct candidate phrase by summing the scores of its words.
 *
 * Phrase frequencies are counted in a single pass and each distinct phrase is
 * scored once, instead of rescanning the whole list for every element. The
 * returned keys, their order and their values match the per-element approach.
 *
 * @param {string[]} phraseList - Candidate phrases, duplicates allowed.
 * @param {Object<string, number>} wordScore - Score per individual word.
 * @param {number} [minKeywordFrequency=1] - Phrases occurring fewer times than
 *   this are dropped. Values of 1 or less keep every phrase.
 * @returns {Object<string, number>} Map of phrase to summed word score, keyed
 *   in order of first appearance. A phrase with no scorable words gets 0.
 */
function generateCandidateKeywordScores(phraseList, wordScore, minKeywordFrequency = 1) {
  const filterByFrequency = minKeywordFrequency > 1;
  const phraseCounts = new Map();
  if (filterByFrequency) {
    for (const phrase of phraseList) {
      phraseCounts.set(phrase, (phraseCounts.get(phrase) || 0) + 1);
    }
  }

  const keywordCandidates = {};
  const seen = new Set();
  for (const phrase of phraseList) {
    // Repeated phrases always produce the same score; handle each one once.
    if (seen.has(phrase)) {
      continue;
    }
    seen.add(phrase);
    if (filterByFrequency && phraseCounts.get(phrase) < minKeywordFrequency) {
      continue;
    }
    let candidateScore = 0;
    for (const word of separateWords(phrase, 0)) {
      candidateScore += wordScore[word];
    }
    keywordCandidates[phrase] = candidateScore;
  }
  return keywordCandidates;
}
/**
 * Splits text into lower-cased word tokens.
 *
 * Any character other than an ASCII letter, digit, `_`, `+`, `-` or `/` acts
 * as a separator. Tokens are dropped when they are not longer than
 * `minWordReturnSize`, are empty, or are flagged by `isNumber`.
 *
 * @param {string} text - Text to tokenise.
 * @param {number} minWordReturnSize - Tokens must be strictly longer than this.
 * @returns {string[]} The surviving tokens in their original order.
 */
function separateWords(text, minWordReturnSize) {
  const separatorPattern = /[^a-zA-Z0-9_\+\-/]/;
  return text
    .split(separatorPattern)
    .map((rawToken) => rawToken.trim().toLowerCase())
    .filter((token) => {
      if (token.length <= minWordReturnSize) {
        return false;
      }
      if (token === "") {
        return false;
      }
      return !isNumber(token);
    });
}
/**
 * Computes a score for every word appearing in the candidate phrases.
 *
 * For each word: frequency is the number of occurrences across all phrases,
 * degree is the sum of (phrase word count - 1) over those occurrences plus the
 * frequency, and the score is degree / frequency.
 *
 * Counting is done in `Map`s rather than plain objects. With a plain object,
 * `word in counts` is also true for inherited names such as "constructor", so
 * those words were never initialised to 0 and ended up with a NaN score.
 *
 * @param {string[]} phraseList - Candidate phrases.
 * @returns {Object<string, number>} Plain object mapping each word to its
 *   score, keyed in order of first appearance.
 */
function calculateWordScores(phraseList) {
  const wordFrequency = new Map();
  const wordDegree = new Map();

  for (const phrase of phraseList) {
    const wordList = separateWords(phrase, 0);
    // Every word co-occurs with all the other words of its phrase.
    const coOccurrences = wordList.length - 1;
    for (const word of wordList) {
      wordFrequency.set(word, (wordFrequency.get(word) || 0) + 1);
      wordDegree.set(word, (wordDegree.get(word) || 0) + coOccurrences);
    }
  }

  const scoreEntries = [];
  for (const [word, frequency] of wordFrequency) {
    const degree = wordDegree.get(word) + frequency;
    scoreEntries.push([word, degree / frequency]);
  }
  // Object.fromEntries defines own properties, so names like "constructor"
  // land on the result itself.
  return Object.fromEntries(scoreEntries);
}
/**
 * Breaks sentences into candidate phrases at stop-word boundaries.
 *
 * Every match of `stopWordPattern` in a sentence is replaced by "|", the
 * result is split on "|", and each piece is trimmed and lower-cased. Pieces
 * that are empty or rejected by `isAcceptable` are discarded.
 *
 * @param {string[]} sentenceList - Sentences (or sentence fragments).
 * @param {RegExp} stopWordPattern - Pattern matching the stop words.
 * @param {number} [minCharLength=1] - Forwarded to `isAcceptable`.
 * @param {number} [maxWordsLength=5] - Forwarded to `isAcceptable`.
 * @returns {string[]} Accepted phrases in order of appearance, duplicates kept.
 */
function generateCandidateKeywords(sentenceList, stopWordPattern, minCharLength = 1, maxWordsLength = 5) {
  const accepted = [];
  for (const sentence of sentenceList) {
    const fragments = stopWordPattern[Symbol.replace](sentence, "|").split("|");
    for (const fragment of fragments) {
      const candidate = fragment.trim().toLowerCase();
      if (candidate == "") {
        continue;
      }
      if (!isAcceptable(candidate, minCharLength, maxWordsLength)) {
        continue;
      }
      accepted.push(candidate);
    }
  }
  return accepted;
}
/**
 * Builds one case-insensitive, global regular expression that matches any
 * stop word as a whole word.
 *
 * Entries without at least one `\w` character are skipped. The remaining ones
 * are wrapped in `\b...\b` and joined with `|`. Words are inserted verbatim,
 * without regex escaping.
 *
 * @param {*} path - Passed through to `loadStopWords`.
 * @returns {RegExp} Alternation of all usable stop words with flags "ig".
 */
function buildStopWordRegex(path) {
  const alternatives = loadStopWords(path)
    .filter((word) => /\w+/.test(word))
    .map((word) => `\\b${word}\\b`);
  return new RegExp(alternatives.join("|"), "ig");
}
/**
 * Splits text into sentence-like fragments at punctuation.
 *
 * Delimiters: `[ ] . ! ? , ; :`, newline, tab, backslash, hyphen, double
 * quote, parentheses, apostrophe, U+2019 (right single quote) and U+2013
 * (en dash).
 *
 * The hyphen is written as an escaped literal. In the previous pattern the
 * sequence `\\-\\` made "-" a range operator between two backslashes, so
 * hyphens were not treated as delimiters even though the pattern listed one.
 * Backslash stays a delimiter so text that was split on it still is.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Fragments between delimiters; may contain empty strings
 *   and untrimmed whitespace.
 */
function splitSentences(text) {
  const sentenceDelimiters = /[\[\]\n.!?,;:\t\\\-"()'\u2019\u2013]/;
  return text.split(sentenceDelimiters);
}
/**
 * Returns the bundled stop-word list.
 *
 * @param {*} path - Accepted for interface compatibility; not read here.
 * @returns {string[]} The `stopwords` array of the module-level `stopwords`
 *   object (the same array instance on every call).
 */
function loadStopWords(path) {
  const { stopwords: bundledWords } = stopwords;
  return bundledWords;
}
/**
 * Runs the full keyword-extraction pipeline over a text.
 *
 * Steps: build the stop-word pattern, split the text into sentences, cut the
 * sentences into candidate phrases, score the individual words, score the
 * phrases, then order the phrases from highest to lowest score.
 *
 * @param {string} text - Text to extract keywords from.
 * @param {*} stopWordsPath - Forwarded to `buildStopWordRegex`.
 * @param {number} [minCharLength=3] - Minimum phrase length in characters.
 * @param {number} [maxWordsLength=5] - Maximum number of words per phrase.
 * @param {number} [minKeywordFrequency=1] - Minimum occurrences of a phrase.
 * @returns {Object<string, number>} Phrase-to-score object whose keys are
 *   inserted in descending score order.
 */
function rake(text, stopWordsPath, minCharLength = 3, maxWordsLength = 5, minKeywordFrequency = 1) {
  const pattern = buildStopWordRegex(stopWordsPath);
  const sentences = splitSentences(text);
  const phrases = generateCandidateKeywords(sentences, pattern, minCharLength, maxWordsLength);
  const scoresByWord = calculateWordScores(phrases);
  const scoresByPhrase = generateCandidateKeywordScores(phrases, scoresByWord, minKeywordFrequency);

  // Sort ascending by score, then flip to get the best phrases first.
  const ranked = sortBy(toPairs(scoresByPhrase), ([, score]) => score);
  ranked.reverse();
  return fromPairs(ranked);
}
export { buildStopWordRegex, calculateWordScores, countOccurances, rake as default, generateCandidateKeywordScores, generateCandidateKeywords, isAcceptable, loadStopWords, separateWords, splitSentences };