@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio
256 lines (255 loc) • 10.3 kB
JavaScript
/**
* File Summarization Service
*
* Orchestrates the end-to-end file summarization pipeline:
* 1. Accept raw file inputs (strings or Buffers)
* 2. Extract readable text and estimate tokens
* 3. Use `planFileSummarization()` to decide which files to summarize
* 4. Call an LLM to produce context-aware summaries of the largest files
* 5. Fall back to truncation when the LLM call fails
*
* The LLM is instantiated via a *dynamic import* of NeuroLink to avoid
* circular dependency issues (NeuroLink → fileSummarizationService → NeuroLink).
*/
import { estimateTokens } from "../utils/tokenEstimation.js";
import { buildFileSummarizationPrompt, planFileSummarization, } from "./fileSummarizer.js";
// ---------------------------------------------------------------------------
// MIME → human label mapping
// ---------------------------------------------------------------------------
/**
 * Lookup table from exact (lower-case, parameter-free) MIME type to the
 * human-readable label shown for a file.
 *
 * The table is built on a null prototype so that lookups with arbitrary
 * caller-supplied keys (e.g. "constructor", "toString") can never resolve to
 * members inherited from Object.prototype, and it is frozen because it is
 * shared, read-only module state.
 */
const MIME_LABEL_MAP = Object.freeze(Object.assign(Object.create(null), {
    "application/pdf": "PDF Document",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "Word Document",
    "application/msword": "Word Document",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "Excel Spreadsheet",
    "application/vnd.ms-excel": "Excel Spreadsheet",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PowerPoint Presentation",
    "application/vnd.ms-powerpoint": "PowerPoint Presentation",
    "application/json": "JSON File",
    "application/xml": "XML File",
    "text/xml": "XML File",
    "text/html": "HTML Document",
    "text/css": "CSS Stylesheet",
    "text/csv": "CSV File",
    "text/plain": "Text File",
    "text/markdown": "Markdown Document",
    "application/javascript": "JavaScript File",
    "text/javascript": "JavaScript File",
    "application/typescript": "TypeScript File",
    "text/typescript": "TypeScript File",
    "application/yaml": "YAML File",
    "text/yaml": "YAML File",
    "image/svg+xml": "SVG Image",
    "application/rtf": "RTF Document",
    "text/rtf": "RTF Document",
    "application/zip": "ZIP Archive",
    "application/gzip": "GZip Archive",
}));
/**
 * Leading segments of MIME types whose payloads are media (images, audio,
 * video) and therefore have no meaningful plain-text form to extract.
 */
const BINARY_MIME_PREFIXES = [
    "image/",
    "audio/",
    "video/",
];
// ---------------------------------------------------------------------------
// Service
// ---------------------------------------------------------------------------
/**
 * Prepares raw file inputs for an LLM context window and, when the plan
 * produced by `planFileSummarization()` asks for it, replaces oversized files
 * with LLM-generated summaries (falling back to truncation).
 */
export class FileSummarizationService {
    /** Provider name passed to the summarizing LLM call. */
    provider;
    /** Model name passed to the summarizing LLM call. */
    model;
    /**
     * @param {{ provider?: string, model?: string }} [options] Optional
     *   overrides; defaults are provider "vertex" and model "gemini-2.5-flash".
     */
    constructor(options) {
        this.provider = options?.provider ?? "vertex";
        this.model = options?.model ?? "gemini-2.5-flash";
    }
    // -------------------------------------------------------------------------
    // Text extraction
    // -------------------------------------------------------------------------
    /**
     * Extract readable text from a file's content.
     *
     * - Strings are returned as-is.
     * - `null` / `undefined` content yields an empty string.
     * - image/*, audio/* and video/* payloads return a placeholder, except
     *   `image/svg+xml`, which is XML text and is decoded like any other text.
     * - Everything else is decoded as UTF-8. Plain `ArrayBuffer`s and typed
     *   arrays are decoded with `TextDecoder`, because their own `toString()`
     *   ignores the encoding argument and would produce "104,105,...".
     *
     * @param {string | Buffer | Uint8Array | ArrayBuffer | null | undefined} content
     * @param {string} [mimeType] MIME type; matched case-insensitively and
     *   without parameters such as `; charset=utf-8`.
     * @param {string} [fileName] Only used in the placeholder text.
     * @returns {string} Extracted text or a `[Binary file: ...]` placeholder.
     */
    extractFileText(content, mimeType, fileName) {
        // String content — already text
        if (typeof content === "string") {
            return content;
        }
        // Nothing to decode
        if (content == null) {
            return "";
        }
        const byteCount = content.byteLength ?? content.length;
        const placeholder = `[Binary file: ${fileName} (${mimeType}, ${byteCount} bytes)]`;
        const baseMime = typeof mimeType === "string"
            ? mimeType.split(";", 1)[0].trim().toLowerCase()
            : "";
        // Media MIME types → placeholder (SVG is text despite its image/ prefix)
        const isBinary = baseMime !== "image/svg+xml" &&
            BINARY_MIME_PREFIXES.some((prefix) => baseMime.startsWith(prefix));
        if (isBinary) {
            return placeholder;
        }
        try {
            const isNodeBuffer = typeof Buffer !== "undefined" && Buffer.isBuffer(content);
            if (!isNodeBuffer &&
                (content instanceof ArrayBuffer || ArrayBuffer.isView(content))) {
                return new TextDecoder("utf-8").decode(content);
            }
            // Node Buffer (or any object exposing a Buffer-like toString)
            return content.toString("utf-8");
        }
        catch {
            return placeholder;
        }
    }
    // -------------------------------------------------------------------------
    // MIME → label
    // -------------------------------------------------------------------------
    /**
     * Map a MIME type (and filename for fallback) to a human-readable label.
     *
     * The MIME type is normalised (parameters stripped, lower-cased) before the
     * table lookup, and only the table's own keys are considered. When the MIME
     * type is unknown, the extension after the last "." of `fileName` decides;
     * a name without any "." has no extension.
     *
     * @param {string} [mimeType]
     * @param {string} [fileName]
     * @returns {string} A label such as "PDF Document"; "File" when unknown.
     */
    getFileTypeLabel(mimeType, fileName) {
        // Direct lookup
        const baseMime = typeof mimeType === "string"
            ? mimeType.split(";", 1)[0].trim().toLowerCase()
            : "";
        // Own-property check so keys like "constructor" never match inherited members.
        if (Object.prototype.hasOwnProperty.call(MIME_LABEL_MAP, baseMime)) {
            return MIME_LABEL_MAP[baseMime];
        }
        // Extension-based fallback
        const name = typeof fileName === "string" ? fileName : "";
        const dotIndex = name.lastIndexOf(".");
        const ext = dotIndex === -1 ? "" : name.slice(dotIndex + 1).toLowerCase();
        switch (ext) {
            case "ts":
            case "tsx":
                return "TypeScript File";
            case "js":
            case "jsx":
                return "JavaScript File";
            case "py":
                return "Python File";
            case "java":
                return "Java File";
            case "go":
                return "Go File";
            case "rs":
                return "Rust File";
            case "rb":
                return "Ruby File";
            case "php":
                return "PHP File";
            case "c":
            case "h":
                return "C File";
            case "cpp":
            case "hpp":
            case "cc":
                return "C++ File";
            case "cs":
                return "C# File";
            case "swift":
                return "Swift File";
            case "kt":
                return "Kotlin File";
            case "md":
                return "Markdown Document";
            case "yaml":
            case "yml":
                return "YAML File";
            case "toml":
                return "TOML File";
            case "ini":
            case "cfg":
                return "Config File";
            case "sh":
            case "bash":
                return "Shell Script";
            case "sql":
                return "SQL File";
            case "csv":
                return "CSV File";
            case "json":
                return "JSON File";
            case "xml":
                return "XML File";
            case "html":
            case "htm":
                return "HTML Document";
            default:
                return "File";
        }
    }
    // -------------------------------------------------------------------------
    // Preparation
    // -------------------------------------------------------------------------
    /**
     * Convert an array of raw file inputs into `FileForSummarization` objects.
     *
     * Extracts text and estimates the token count for each file.
     *
     * @param {Array<{ fileName: string, mimeType: string, content: *, originalSize?: number }>} files
     *   Raw inputs; a nullish value is treated as an empty list.
     * @param {string} [provider] Provider used for token estimation; defaults
     *   to this service's provider.
     * @returns {Array<object>} One entry per input with `fileName`, `fileType`,
     *   `content`, `estimatedTokens`, `mimeType` and `originalSize`.
     */
    prepareFilesForSummarization(files, provider) {
        const effectiveProvider = provider ?? this.provider;
        return (files ?? []).map((file) => {
            const text = this.extractFileText(file.content, file.mimeType, file.fileName);
            return {
                fileName: file.fileName,
                fileType: this.getFileTypeLabel(file.mimeType, file.fileName),
                content: text,
                estimatedTokens: estimateTokens(text, effectiveProvider),
                mimeType: file.mimeType,
                originalSize: file.originalSize,
            };
        });
    }
    // -------------------------------------------------------------------------
    // Summarization
    // -------------------------------------------------------------------------
    /**
     * Summarize files that exceed the context budget.
     *
     * Entries planned as "keep" pass through unchanged. For every other entry
     * the configured LLM is asked for a context-aware summary. If that call
     * throws, or returns an empty / non-string result, the file is truncated to
     * its token target instead, so the file's content is never silently
     * replaced by an empty string and the request can still proceed.
     *
     * Files are processed sequentially, in plan order.
     *
     * @param {Array<object>} files Output of `prepareFilesForSummarization()`.
     * @param {string} userPrompt The user's request, passed to the summary prompt.
     * @param {object} budgetParams Forwarded to `planFileSummarization()`; its
     *   `provider` (if any) is used for token estimation of the results.
     * @returns {Promise<Array<{ fileName: string, fileType: string, summary: string,
     *   originalTokens: number, summaryTokens: number, wasSummarized: boolean }>>}
     */
    async summarizeFiles(files, userPrompt, budgetParams) {
        const plan = planFileSummarization(files, budgetParams);
        const tokenProvider = budgetParams?.provider;
        const results = [];
        // Created on first use and shared by every file in this call.
        // NOTE(review): assumes a default-constructed NeuroLink carries no state
        // between generate() calls — confirm.
        let summarizer;
        for (const entry of plan) {
            const { file } = entry;
            if (entry.action === "keep") {
                results.push({
                    fileName: file.fileName,
                    fileType: file.fileType,
                    summary: file.content,
                    originalTokens: file.estimatedTokens,
                    summaryTokens: file.estimatedTokens,
                    wasSummarized: false,
                });
                continue;
            }
            // Action is "summarize"
            const targetTokens = entry.targetTokens ?? 2000;
            let summary;
            let summaryTokens;
            try {
                if (!summarizer) {
                    // Dynamic import to avoid circular dependency
                    const { NeuroLink } = await import("../neurolink.js");
                    summarizer = new NeuroLink();
                }
                const prompt = buildFileSummarizationPrompt({
                    fileName: file.fileName,
                    fileType: file.fileType,
                    fileContent: file.content,
                    userPrompt,
                    targetTokens,
                });
                const result = await summarizer.generate({
                    input: { text: prompt },
                    provider: this.provider,
                    model: this.model,
                });
                const summaryText = typeof result === "string" ? result : (result?.content ?? "");
                // A blank or non-string summary would discard the file entirely;
                // leave `summary` unset so the truncation fallback runs instead.
                if (typeof summaryText === "string" && summaryText.trim() !== "") {
                    summaryTokens = estimateTokens(summaryText, tokenProvider);
                    summary = summaryText;
                }
            }
            catch {
                // Best-effort: any failure of the LLM path routes to truncation below.
                summary = undefined;
            }
            if (summary === undefined) {
                // Fallback: naive truncation
                const { truncateToTokenBudget } = await import("../utils/tokenEstimation.js");
                const { text: truncated } = truncateToTokenBudget(file.content, targetTokens, tokenProvider);
                summary = truncated;
                summaryTokens = estimateTokens(truncated, tokenProvider);
            }
            results.push({
                fileName: file.fileName,
                fileType: file.fileType,
                summary,
                originalTokens: file.estimatedTokens,
                summaryTokens,
                wasSummarized: true,
            });
        }
        return results;
    }
}