UNPKG

generator-begcode

Version:

Spring Boot + Angular/React/Vue in one handy generator

www.begcode.com

begcode/generator-begcode

124 lines (123 loc) • 5.35 kB

JavaScript

import { cleanWhitespace } from './cleanWhitespace.js';

/**
 * Static helpers that split text into smaller pieces ("chunks").
 *
 * Methods taking `content` first pass it through the imported `cleanWhitespace`
 * helper; its exact normalization is defined in './cleanWhitespace.js' and is
 * not visible from this file.
 */
export class TextChunker {
  /**
   * Cuts `text` into consecutive windows of `chunkLength` UTF-16 code units.
   * Each window starts `chunkLength - overlap` units after the previous one, so
   * neighbouring chunks share `overlap` units.
   *
   * @param {string} text - Text to split.
   * @param {{ chunkLength: number, overlap?: number }} opts - Window size and
   *   overlap; `overlap` defaults to 0 when omitted.
   * @returns {string[]} The chunks in document order (empty for empty text).
   * @throws {Error} When `chunkLength` is not greater than `overlap`.
   */
  static fixedCharacterLength(text, opts) {
    const { chunkLength, overlap = 0 } = opts;
    if (chunkLength <= overlap) {
      throw new Error('Chunk length must be greater than overlap length.');
    }
    const chunks = [];
    let startIndex = 0;
    while (startIndex < text.length) {
      const endIndex = startIndex + chunkLength;
      chunks.push(text.slice(startIndex, endIndex));
      // Once a window reaches the end of the text every further window would
      // hold only overlap characters that the last chunk already contains.
      if (endIndex >= text.length) {
        break;
      }
      startIndex = endIndex - overlap;
    }
    return chunks;
  }

  /**
   * Splits `text` into parent chunks, splits every parent into child chunks and
   * returns the children, each tagged with the parent it was cut from.
   *
   * @param {string} text - Text to split.
   * @param {{ parentChunker: function(string): string[], childChunker: function(string): string[] }} opts -
   *   `parentChunker` is called once with `text`; `childChunker` is called once
   *   per parent chunk.
   * @returns {{ doc: string, metadata: { parent: string } }[]} One entry per
   *   child chunk, in parent order and then child order.
   */
  static parentDocRetrieval(text, opts) {
    const { parentChunker, childChunker } = opts;
    return parentChunker(text).flatMap(parent => childChunker(parent).map(doc => ({ doc, metadata: { parent } })));
  }

  /**
   * Splits English-like prose into sentences with a rule-based heuristic.
   *
   * Periods that do not end a sentence (titles, web-site suffixes, decimals,
   * initials, acronyms, company suffixes, ellipses) are temporarily replaced by
   * the placeholder `<prd>`. Every remaining `.`, `?` and `!` is followed by a
   * `<stop>` marker, placeholders are turned back into periods and the text is
   * split on the markers. Because the placeholders are plain text, input that
   * already contains the literal strings `<prd>` or `<stop>` is affected too.
   *
   * @param {string} text - Text to split; newlines are treated as spaces.
   * @returns {string[]} Trimmed sentences in order of appearance.
   */
  static sentences(text) {
    const alphabets = '([A-Za-z])';
    const prefixes = '(Mr|St|Mrs|Ms|Dr)[.]';
    const suffixes = '(Inc|Ltd|Jr|Sr|Co)';
    const starters =
      '(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever)';
    const acronyms = '([A-Z][.][A-Z][.](?:[A-Z][.])?)';
    const websites = '[.](com|net|org|io|gov|edu|me)';
    const digits = '([0-9])';
    // Two or more consecutive dots (an ellipsis). In a regex literal a single
    // backslash escapes the dot; a doubled backslash would instead match a
    // literal backslash followed by arbitrary characters.
    const multipleDots = /\.{2,}/g;

    // Pad with spaces so the rules anchored on a leading/trailing space also
    // apply at the very beginning and end of the text.
    let marked = ` ${text} `;
    marked = marked.replaceAll('\n', ' ');
    marked = marked.replaceAll(new RegExp(prefixes, 'g'), '$1<prd>');
    marked = marked.replaceAll(new RegExp(websites, 'g'), '<prd>$1');
    marked = marked.replaceAll(new RegExp(`${digits}[.]${digits}`, 'g'), '$1<prd>$2');
    // An ellipsis keeps all of its dots and ends the sentence.
    marked = marked.replaceAll(multipleDots, match => `${'<prd>'.repeat(match.length)}<stop>`);
    if (marked.includes('Ph.D')) {
      marked = marked.replaceAll('Ph.D.', 'Ph<prd>D<prd>');
    }
    marked = marked.replaceAll(new RegExp(`\\s${alphabets}[.] `, 'g'), ' $1<prd> ');
    marked = marked.replaceAll(new RegExp(`${acronyms} ${starters}`, 'g'), '$1<stop> $2');
    marked = marked.replaceAll(new RegExp(`${alphabets}[.]${alphabets}[.]${alphabets}[.]`, 'g'), '$1<prd>$2<prd>$3<prd>');
    marked = marked.replaceAll(new RegExp(`${alphabets}[.]${alphabets}[.]`, 'g'), '$1<prd>$2<prd>');
    marked = marked.replaceAll(new RegExp(` ${suffixes}[.] ${starters}`, 'g'), ' $1<stop> $2');
    marked = marked.replaceAll(new RegExp(` ${suffixes}[.]`, 'g'), ' $1<prd>');
    marked = marked.replaceAll(new RegExp(` ${alphabets}[.]`, 'g'), ' $1<prd>');
    // Move closing quotes in front of the terminator so the quote stays with
    // the sentence it closes once the text is split after the terminator.
    if (marked.includes('”')) {
      marked = marked.replaceAll('.”', '”.');
    }
    if (marked.includes('"')) {
      marked = marked.replaceAll('."', '".');
    }
    if (marked.includes('!')) {
      marked = marked.replaceAll('!"', '"!');
    }
    if (marked.includes('?')) {
      marked = marked.replaceAll('?"', '"?');
    }
    marked = marked.replaceAll('.', '.<stop>');
    marked = marked.replaceAll('?', '?<stop>');
    marked = marked.replaceAll('!', '!<stop>');
    marked = marked.replaceAll('<prd>', '.');

    const result = marked.split('<stop>').map(part => part.trim());
    // Text ending in a terminator leaves an empty trailing piece behind.
    if (result[result.length - 1] === '') {
      result.pop();
    }
    return result;
  }

  /**
   * Returns the lines of the whitespace-cleaned content, one chunk per line.
   *
   * @param {string} content - Text to split.
   * @returns {string[]} `cleanWhitespace(content)` split on '\n'.
   */
  static singleLine(content) {
    return cleanWhitespace(content).split('\n');
  }

  /**
   * Groups the lines of the whitespace-cleaned content into chunks of
   * `linesPerChunk` lines; the last chunk may hold fewer lines.
   *
   * @param {string} content - Text to split.
   * @param {number} linesPerChunk - Number of lines per chunk; must be > 0.
   * @returns {string[]} Chunks whose lines are joined with '\n'.
   * @throws {Error} When `linesPerChunk` is not greater than zero (the loop
   *   below could otherwise never advance).
   */
  static multiLines(content, linesPerChunk) {
    if (!(linesPerChunk > 0)) {
      throw new Error('Lines per chunk must be greater than zero.');
    }
    const lines = cleanWhitespace(content).split('\n');
    const chunks = [];
    for (let i = 0; i < lines.length; i += linesPerChunk) {
      chunks.push(lines.slice(i, i + linesPerChunk).join('\n'));
    }
    return chunks;
  }

  /**
   * Splits the whitespace-cleaned content into chunks of at most
   * `characterLimit` UTF-16 code units. Iteration is per code point, so a
   * surrogate pair is never cut in half; a single character longer than the
   * limit still becomes a chunk of its own.
   *
   * @param {string} content - Text to split.
   * @param {number} characterLimit - Maximum chunk length.
   * @returns {string[]} Non-empty chunks covering the whole cleaned content.
   */
  static characters(content, characterLimit) {
    const trimmedContent = cleanWhitespace(content);
    const chunks = [];
    let currentChunk = '';
    for (const char of trimmedContent) {
      // Never flush an empty chunk, even when the limit is smaller than one character.
      if (currentChunk.length > 0 && currentChunk.length + char.length > characterLimit) {
        chunks.push(currentChunk);
        currentChunk = '';
      }
      currentChunk += char;
    }
    // Flush the remainder; without this the tail of the content would be lost.
    if (currentChunk.length > 0) {
      chunks.push(currentChunk);
    }
    return chunks;
  }

  /**
   * Splits the whitespace-cleaned content on single spaces and packs the words
   * into chunks of at most `characterLimit` UTF-16 code units, joining the
   * words of a chunk with one space. A word longer than the limit becomes a
   * chunk of its own. Empty tokens produced by consecutive spaces are skipped.
   *
   * @param {string} content - Text to split.
   * @param {number} characterLimit - Maximum chunk length.
   * @returns {string[]} Non-empty chunks, words in original order.
   */
  static words(content, characterLimit) {
    const trimmedContent = cleanWhitespace(content);
    const chunks = [];
    let currentChunk = '';
    for (const word of trimmedContent.split(' ')) {
      if (word.length === 0) {
        continue;
      }
      if (currentChunk.length === 0) {
        currentChunk = word;
      } else if (currentChunk.length + 1 + word.length > characterLimit) {
        // The word (plus its separating space) does not fit any more.
        chunks.push(currentChunk);
        currentChunk = word;
      } else {
        currentChunk += ` ${word}`;
      }
    }
    // Flush the remainder; without this the last words would be lost.
    if (currentChunk.length > 0) {
      chunks.push(currentChunk);
    }
    return chunks;
  }
}