web-ai-toolkit
Version:
AI powered features on the web made easy
123 lines (104 loc) • 4.47 kB
text/typescript
/**
* Shared utilities for summarization services
*/
/** Matches a single whitespace character; hoisted so it is not rebuilt inside the scan loops. */
const CHUNK_WHITESPACE_PATTERN = /\s/;

/** Matches text ending in a capital letter plus at most one lowercase letter (e.g. "Mr", "Dr"). */
const CHUNK_ABBREVIATION_PATTERN = /\b[A-Z][a-z]?$/;

/**
 * Chooses where a chunk should end, scanning backwards from `hardEnd` for a natural break.
 *
 * Preference order: sentence terminator followed by whitespace (or end of text),
 * then a blank line (`\n\n`), then any whitespace character.
 *
 * @param text - Full text being chunked.
 * @param hardEnd - Index at which the chunk is cut when no natural break exists.
 * @param searchFloor - Scanning stops above this index so chunks do not become too small.
 * @returns Exclusive end index of the chunk; `hardEnd` when no natural break was found.
 */
function findChunkBreakPoint(text: string, hardEnd: number, searchFloor: number): number {
  // Each search returns as soon as it finds a break. The original compared the result
  // against `hardEnd` to detect "not found", which discarded breaks that landed on it.
  for (let i = hardEnd; i > searchFloor; i--) {
    const ch = text.charAt(i);
    if (ch !== '.' && ch !== '!' && ch !== '?') {
      continue;
    }
    // A terminator only ends a sentence when whitespace (or nothing) follows it.
    if (i + 1 < text.length && !CHUNK_WHITESPACE_PATTERN.test(text.charAt(i + 1))) {
      continue;
    }
    // Simple heuristic to avoid splitting after abbreviations such as "Mr." or "Dr.".
    if (ch === '.' && CHUNK_ABBREVIATION_PATTERN.test(text.substring(Math.max(0, i - 10), i).trim())) {
      continue;
    }
    return i + 1;
  }

  for (let i = hardEnd; i > searchFloor; i--) {
    if (text.charAt(i) === '\n' && text.charAt(i + 1) === '\n') {
      return i + 2;
    }
  }

  for (let i = hardEnd; i > searchFloor; i--) {
    if (CHUNK_WHITESPACE_PATTERN.test(text.charAt(i))) {
      return i + 1;
    }
  }

  return hardEnd;
}

/**
 * Splits text into trimmed chunks of roughly `maxChunkLength` characters, preferring to
 * cut at sentence ends, then blank lines, then whitespace.
 *
 * Notes on the contract:
 * - Text that already fits (or a `maxChunkLength` that is not a number >= 1) yields `[text]` unchanged.
 * - A chunk shorter than `minChunkLength` is merged into the previous chunk (joined with a
 *   single space), so merged chunks may exceed `maxChunkLength`. Only the text the previous
 *   chunk does not already contain is appended, so overlap is never duplicated by a merge.
 * - A chunk may exceed `maxChunkLength` by one character when a sentence terminator sits
 *   exactly on the boundary.
 * - Chunks that would add no new non-whitespace text are not emitted.
 *
 * @param text - Text to split.
 * @param maxChunkLength - Target maximum chunk length in characters (fractions are floored).
 * @param overlap - Characters of the previous chunk repeated at the start of the next one.
 *   Negative or NaN values are treated as 0; progress of at least one character per step
 *   is always guaranteed.
 * @param minChunkLength - Chunks shorter than this are merged into the previous chunk.
 *   Negative or NaN values are treated as 0.
 * @returns The chunks in document order.
 */
export function splitTextIntoChunks(text: string, maxChunkLength: number, overlap: number, minChunkLength: number): string[] {
  const maxLen = Math.floor(maxChunkLength);
  // `!(maxLen >= 1)` also catches NaN, for which chunking is impossible; returning the
  // text whole is safer than silently dropping it.
  if (!(maxLen >= 1) || text.length <= maxLen) {
    return [text];
  }

  const minLen = minChunkLength > 0 ? minChunkLength : 0;
  const overlapLen = overlap > 0 ? Math.floor(overlap) : 0;
  // How far past the chunk start a natural break must lie to be accepted.
  const minSpan = Math.min(minLen, maxLen * 0.5);

  const chunks: string[] = [];
  let position = 0;
  // Index up to which the emitted chunks already contain the text.
  let coveredEnd = 0;

  while (position < text.length) {
    const hardEnd = position + maxLen;
    const end = hardEnd >= text.length
      ? text.length
      : findChunkBreakPoint(text, hardEnd, position + minSpan);

    // Text in this window that no earlier chunk contains yet.
    const fresh = text.substring(Math.min(coveredEnd, end), end).trim();

    if (fresh.length > 0) {
      const chunk = text.substring(position, end).trim();
      if (chunks.length === 0 || chunk.length >= minLen) {
        chunks.push(chunk);
      } else {
        // Too short to stand alone: extend the previous chunk with the new text only.
        const lastIndex = chunks.length - 1;
        chunks[lastIndex] = `${chunks[lastIndex] ?? ''} ${fresh}`;
      }
    }

    coveredEnd = Math.max(coveredEnd, end);
    if (end >= text.length) {
      // Everything is covered; continuing would only re-emit the overlap tail.
      break;
    }

    // Step back by the overlap, but always advance by at least one character.
    position = Math.max(end - overlapLen, position + 1);
  }

  return chunks;
}
/**
 * Summarizes text chunk by chunk and joins the partial summaries into one result.
 *
 * The text is split with `splitTextIntoChunks`, each chunk is summarized in order (one
 * at a time, awaiting each call before starting the next), and the extracted summary
 * texts are joined with single spaces.
 *
 * Progress is reported as 0 before processing, rising to 0.9 as chunks complete, then
 * 0.95 before combining and 1.0 once the combined text is ready.
 *
 * @param text - Text to summarize; callers are expected to pass text long enough to need chunking.
 * @param maxChunkLength - Forwarded to `splitTextIntoChunks`.
 * @param overlap - Forwarded to `splitTextIntoChunks`.
 * @param minChunkLength - Forwarded to `splitTextIntoChunks`.
 * @param onProgress - Optional callback receiving a progress fraction and a status message.
 * @param summarizeFunction - Produces a summary for a single chunk.
 * @param extractTextFunction - Pulls the plain summary text out of a summary value.
 * @param createReturnValue - Wraps the combined summary text in the caller's result type.
 * @returns The value produced by `createReturnValue` for the combined summary text.
 */
export async function processChunkedSummarization<T>(
  text: string,
  maxChunkLength: number,
  overlap: number,
  minChunkLength: number,
  onProgress: ((progress: number, message: string) => void) | undefined,
  summarizeFunction: (chunk: string) => Promise<T>,
  extractTextFunction: (summary: T) => string,
  createReturnValue: (combinedText: string) => T,
): Promise<T> {
  const pieces = splitTextIntoChunks(text, maxChunkLength, overlap, minChunkLength);
  const total = pieces.length;

  // Single funnel for progress updates so the optional callback is checked in one place.
  const report = (fraction: number, message: string): void => {
    onProgress?.(fraction, message);
  };

  report(0, `Processing ${total} chunks...`);

  // Chunks are handled strictly one after another.
  const partialTexts: string[] = [];
  let completed = 0;
  for (const piece of pieces) {
    const summary = await summarizeFunction(piece);
    partialTexts.push(extractTextFunction(summary));
    completed += 1;
    // Chunk processing accounts for the first 90% of reported progress.
    report((completed / total) * 0.9, `Processed ${completed} of ${total} chunks`);
  }

  report(0.95, 'Combining summaries...');
  const combinedText = partialTexts.join(' ');
  report(1.0, 'Summarization complete!');

  return createReturnValue(combinedText);
}