UNPKG

llamaindex

Version:

<p align="center"> <img height="100" width="100" alt="LlamaIndex logo" src="https://ts.llamaindex.ai/square.svg" /> </p> <h1 align="center">LlamaIndex.TS</h1> <h3 align="center"> Data framework for your LLM application. </h3>

584 lines (583 loc) 16.1 kB
// generate from "tsup ./src/index.js --format esm"

// src/stopwords.js
/**
 * Built-in English stop-word list used to cut sentences into candidate phrases.
 * Entries without any word character (e.g. "-") are skipped when the matching
 * regular expression is built. The array is intentionally not frozen because
 * `loadStopWords` hands this exact array to callers.
 */
const STOP_WORDS = [
  "a", "about", "above", "across", "after", "again", "against", "all", "almost", "alone",
  "along", "already", "also", "although", "always", "among", "an", "and", "another", "any",
  "anybody", "anyone", "anything", "anywhere", "are", "area", "areas", "around", "as", "ask",
  "asked", "asking", "asks", "at", "away", "b", "back", "backed", "backing", "backs",
  "be", "because", "become", "becomes", "became", "been", "before", "began", "behind", "being",
  "beings", "best", "better", "between", "big", "both", "but", "by", "c", "came",
  "can", "cannot", "case", "cases", "certain", "certainly", "clear", "clearly", "come", "contains",
  "could", "d", "did", "differ", "different", "differently", "do", "does", "done", "down",
  "downed", "downing", "downs", "during", "e", "each", "early", "either", "end", "ended",
  "ending", "ends", "enough", "even", "evenly", "ever", "every", "everybody", "everyone", "everything",
  "everywhere", "f", "face", "faces", "fact", "facts", "far", "felt", "few", "find",
  "finds", "first", "for", "four", "from", "full", "fully", "further", "furthered", "furthering",
  "furthers", "g", "gave", "general", "generally", "get", "gets", "give", "given", "gives",
  "go", "going", "good", "goods", "got", "great", "greater", "greatest", "group", "grouped",
  "grouping", "groups", "h", "had", "has", "have", "having", "he", "her", "herself",
  "here", "high", "higher", "highest", "him", "himself", "his", "how", "however", "i",
  "if", "important", "in", "interest", "interested", "interesting", "interests", "into", "is", "it",
  "its", "itself", "j", "just", "k", "keep", "keeps", "kind", "knew",
  "know", "known", "knows", "l", "large", "largely", "last", "later", "latest", "least",
  "less", "let", "lets", "like", "likely", "long", "longer", "longest", "m", "made",
  "make", "making", "man", "many", "may", "me", "member", "members", "men", "might",
  "more", "most", "mostly", "mr", "mrs", "much", "must", "my", "myself", "n",
  "necessary", "need", "needed", "needing", "needs", "never", "new", "newer", "newest", "next",
  "no", "non", "not", "nobody", "noone", "nothing", "now", "nowhere", "number", "numbers",
  "o", "of", "off", "often", "old", "older", "oldest", "on", "once", "one",
  "only", "open", "opened", "opening", "opens", "or", "order", "ordered", "ordering", "orders",
  "other", "others", "our", "out", "over", "p", "part", "parted", "parting", "parts",
  "per", "perhaps", "place", "places", "point", "pointed", "pointing", "points", "possible", "present",
  "presented", "presenting", "presents", "problem", "problems", "put", "puts", "q", "quite", "r",
  "rather", "really", "right", "room", "rooms", "s", "said", "same", "saw", "say",
  "says", "second", "seconds", "see", "sees", "seem", "seemed", "seeming", "seems", "several",
  "shall", "she", "should", "show", "showed", "showing", "shows", "side", "sides", "since",
  "small", "smaller", "smallest", "so", "some", "somebody", "someone", "something", "somewhere", "state",
  "states", "still", "such", "sure", "t", "take", "taken", "than", "that", "the",
  "their", "them", "then", "there", "therefore", "these", "they", "thing", "things", "think",
  "thinks", "this", "those", "though", "thought", "thoughts", "three", "through", "thus", "to",
  "today", "together", "too", "took", "toward", "turn", "turned", "turning", "turns", "two",
  "u", "under", "until", "up", "upon", "us", "use", "uses", "used", "v",
  "very", "w", "want", "wanted", "wanting", "wants", "was", "way", "ways", "we",
  "well", "wells", "went", "were", "what", "when", "where", "whether", "which", "while",
  "who", "whole", "whose", "why", "will", "with", "within", "without",
  "work", "worked", "working", "works", "would", "y", "year", "years", "yet", "you",
  "young", "younger", "youngest", "your", "yours", "eoc", "mu", "sigma", "mu sigma", "musigma",
  "client", "clients", "capabilities", "capability", "firm", "firms", "biggest", "-"
];

// src/index.js

/**
 * Reports whether a token contains at least one decimal digit.
 *
 * NOTE(review): despite the name this is true for mixed tokens such as "b2b",
 * so `separateWords` drops them. Kept as-is because changing it would alter
 * keyword scores for existing callers.
 *
 * @param {string} str - Token to inspect.
 * @returns {boolean} True when `str` contains any digit.
 */
function isNumber(str) {
  return /\d/.test(str);
}

/**
 * Decides whether a candidate phrase is worth scoring.
 *
 * A phrase is rejected when it is shorter than `minCharLength` characters,
 * has more than `maxWordsLength` space-separated words, contains no ASCII
 * letters, or contains more digits than ASCII letters.
 *
 * @param {string} phrase - Candidate phrase.
 * @param {number} minCharLength - Minimum phrase length in characters.
 * @param {number} maxWordsLength - Maximum number of space-separated words.
 * @returns {boolean} True when the phrase passes every check.
 */
function isAcceptable(phrase, minCharLength, maxWordsLength) {
  // Compare the phrase LENGTH; comparing the string itself against a number
  // coerces it to NaN and silently disables this filter.
  if (phrase.length < minCharLength) {
    return false;
  }
  if (phrase.split(" ").length > maxWordsLength) {
    return false;
  }
  let digits = 0;
  let alpha = 0;
  for (const ch of phrase) {
    if (/\d/.test(ch)) {
      digits += 1;
    }
    if (/[a-zA-Z]/.test(ch)) {
      alpha += 1;
    }
  }
  if (alpha === 0) {
    return false;
  }
  return digits <= alpha;
}

/**
 * Counts how many elements of `haystack` are strictly equal to `needle`.
 *
 * @param {Array<*>} haystack - Values to scan.
 * @param {*} needle - Value to look for.
 * @returns {number} Number of strict-equality matches.
 */
function countOccurances(haystack, needle) {
  return haystack.reduce((count, value) => (value === needle ? count + 1 : count), 0);
}

/**
 * Scores every candidate phrase as the sum of the scores of its words.
 *
 * @param {string[]} phraseList - Candidate phrases (duplicates allowed).
 * @param {Object<string, number>} wordScore - Per-word scores, as produced by
 *   `calculateWordScores`.
 * @param {number} [minKeywordFrequency=1] - Phrases occurring fewer times than
 *   this in `phraseList` are skipped.
 * @returns {Object<string, number>} Phrase -> score, in first-seen order.
 */
function generateCandidateKeywordScores(phraseList, wordScore, minKeywordFrequency = 1) {
  // Count phrases once up front instead of rescanning the list per phrase.
  let phraseCounts = null;
  if (minKeywordFrequency > 1) {
    phraseCounts = new Map();
    for (const phrase of phraseList) {
      phraseCounts.set(phrase, (phraseCounts.get(phrase) ?? 0) + 1);
    }
  }

  // A Map avoids collisions with Object.prototype keys such as "constructor".
  const candidates = new Map();
  for (const phrase of phraseList) {
    if (phraseCounts !== null && phraseCounts.get(phrase) < minKeywordFrequency) {
      continue;
    }
    let candidateScore = 0;
    for (const word of separateWords(phrase, 0)) {
      candidateScore += wordScore[word];
    }
    candidates.set(phrase, candidateScore);
  }
  return Object.fromEntries(candidates);
}

/**
 * Splits text into lower-cased words, discarding short and digit-bearing tokens.
 *
 * Any character other than ASCII letters, digits, "_", "+", "-" and "/" acts
 * as a separator.
 *
 * @param {string} text - Text to tokenize.
 * @param {number} minWordReturnSize - Words must be strictly longer than this.
 * @returns {string[]} Accepted words in order of appearance.
 */
function separateWords(text, minWordReturnSize) {
  const wordDelimiters = /[^a-zA-Z0-9_\+\-/]/;
  return text
    .split(wordDelimiters)
    .map((rawWord) => rawWord.trim().toLowerCase())
    .filter((word) => word.length > minWordReturnSize && word !== "" && !isNumber(word));
}

/**
 * Computes a score for each word as (degree + frequency) / frequency, where
 * frequency is the number of occurrences across all phrases and degree is the
 * summed count of co-occurring words within each phrase.
 *
 * @param {string[]} phraseList - Candidate phrases.
 * @returns {Object<string, number>} Word -> score, in first-seen order.
 */
function calculateWordScores(phraseList) {
  // Maps keep words like "constructor" or "__proto__" from resolving to
  // inherited Object.prototype members, which previously yielded NaN scores.
  const wordFrequency = new Map();
  const wordDegree = new Map();

  for (const phrase of phraseList) {
    const wordList = separateWords(phrase, 0);
    const wordListDegree = wordList.length - 1;
    for (const word of wordList) {
      wordFrequency.set(word, (wordFrequency.get(word) ?? 0) + 1);
      wordDegree.set(word, (wordDegree.get(word) ?? 0) + wordListDegree);
    }
  }

  const wordScore = new Map();
  for (const [word, frequency] of wordFrequency) {
    const degree = wordDegree.get(word) + frequency;
    wordScore.set(word, degree / frequency);
  }
  return Object.fromEntries(wordScore);
}

/**
 * Cuts each sentence at stop words and returns the acceptable phrases between them.
 *
 * Every stop-word match is replaced with "|" and the sentence is then split on
 * "|", so a literal "|" in the input also acts as a phrase boundary.
 *
 * @param {string[]} sentenceList - Sentences (see `splitSentences`).
 * @param {RegExp} stopWordPattern - Global pattern from `buildStopWordRegex`.
 * @param {number} [minCharLength=1] - Minimum phrase length in characters.
 * @param {number} [maxWordsLength=5] - Maximum words per phrase.
 * @returns {string[]} Trimmed, lower-cased candidate phrases (with duplicates).
 */
function generateCandidateKeywords(sentenceList, stopWordPattern, minCharLength = 1, maxWordsLength = 5) {
  const phraseList = [];
  for (const sentence of sentenceList) {
    const marked = stopWordPattern[Symbol.replace](sentence, "|");
    for (const fragment of marked.split("|")) {
      const phrase = fragment.trim().toLowerCase();
      if (phrase !== "" && isAcceptable(phrase, minCharLength, maxWordsLength)) {
        phraseList.push(phrase);
      }
    }
  }
  return phraseList;
}

/**
 * Builds a case-insensitive, global regular expression matching any stop word
 * as a whole word.
 *
 * @param {string} [path] - Forwarded to `loadStopWords`, which ignores it.
 * @returns {RegExp} Alternation of `\bword\b` patterns with flags "gi".
 */
function buildStopWordRegex(path) {
  const stopWordRegexList = loadStopWords(path)
    // Entries without a word character (e.g. "-") cannot be anchored with \b.
    .filter((word) => /\w+/.test(word))
    .map((word) => `\\b${word}\\b`);
  return new RegExp(stopWordRegexList.join("|"), "ig");
}

/**
 * Splits text into sentence-like fragments at punctuation and line breaks.
 *
 * NOTE(review): inside the character class `\\-\\` parses as the range
 * backslash..backslash, so "-" is NOT a delimiter while "\" is. The pattern is
 * preserved verbatim to keep existing output stable.
 *
 * @param {string} text - Raw input text.
 * @returns {string[]} Fragments between delimiters (may include empty strings).
 */
function splitSentences(text) {
  const sentenceDelimiters = /[\[\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]/;
  return text.split(sentenceDelimiters);
}

/**
 * Returns the built-in stop-word list.
 *
 * @param {string} [path] - Unused; accepted for signature compatibility.
 * @returns {string[]} The shared stop-word array (not a copy).
 */
function loadStopWords(path) {
  return STOP_WORDS;
}

/**
 * Extracts keywords from text with the RAKE approach implemented in this file:
 * split into sentences, cut at stop words, score words, then score phrases.
 *
 * @param {string} text - Text to analyse.
 * @param {string} [stopWordsPath] - Forwarded to `buildStopWordRegex`; the
 *   built-in stop-word list is always used.
 * @param {number} [minCharLength=3] - Minimum phrase length in characters.
 * @param {number} [maxWordsLength=5] - Maximum words per phrase.
 * @param {number} [minKeywordFrequency=1] - Minimum occurrences of a phrase.
 * @returns {Object<string, number>} Phrase -> score, highest score first.
 */
function rake(text, stopWordsPath, minCharLength = 3, maxWordsLength = 5, minKeywordFrequency = 1) {
  const stopWordPattern = buildStopWordRegex(stopWordsPath);
  const sentenceList = splitSentences(text);
  const phraseList = generateCandidateKeywords(sentenceList, stopWordPattern, minCharLength, maxWordsLength);
  const wordScores = calculateWordScores(phraseList);
  const keywordCandidates = generateCandidateKeywordScores(phraseList, wordScores, minKeywordFrequency);

  // Stable ascending sort followed by reverse: descending by score, with ties
  // in reverse first-seen order.
  const ranked = Object.entries(keywordCandidates)
    .sort((left, right) => left[1] - right[1])
    .reverse();
  return Object.fromEntries(ranked);
}

export {
  buildStopWordRegex,
  calculateWordScores,
  countOccurances,
  rake as default,
  generateCandidateKeywordScores,
  generateCandidateKeywords,
  isAcceptable,
  loadStopWords,
  separateWords,
  splitSentences
};