llm-prepare
Version:
A utility designed to streamline the preparation of diverse text sources for Large Language Model (LLM) consumption. It intelligently flattens project structures and handles truncation and formatting for in-context learning (ICL) prompts.
286 lines (232 loc) • 8.89 kB
JavaScript
/**
* Integration tests for format conversion and truncation together
*/
import { describe, test, expect, beforeAll, afterAll } from '@jest/globals';
import { processText } from '../../src/index.js';
import { convertFormat } from '../../src/formatters/format-converter.js';
import { truncateText } from '../../src/processors/truncate.js';
import { detectFormat } from '../../src/utils/format-detector.js';
import { estimateTokenCount } from '../../src/utils/token-counter.js';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
// Get directory name
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Test files
const testFilesDir = path.join(__dirname, '../fixtures/format-truncate');
const markdownFile = path.join(testFilesDir, 'sample.md');
const htmlFile = path.join(testFilesDir, 'sample.html');
const textFile = path.join(testFilesDir, 'sample.txt');
const outputFile = path.join(testFilesDir, 'output.txt');
// Create test files
/**
 * Creates the fixture directory and writes the three sample documents
 * (markdown, HTML, plain text) that every test in this file reads.
 *
 * The three fixtures describe the same document so that the tests can assert
 * on equivalent content ("Introduction", "Links", "Example link", ...) no
 * matter which source format they start from.
 */
beforeAll(async () => {
  await fs.mkdir(testFilesDir, { recursive: true });

  // Create sample markdown file.
  // The headings and the link text are required by the markdown-to-text test,
  // which looks for 'SAMPLE MARKDOWN DOCUMENT', 'INTRODUCTION', 'LINKS' and
  // 'Example link' in the converted output.
  await fs.writeFile(markdownFile, `# Sample Markdown Document

## Introduction

This is a **sample** document to test *format conversion* and truncation.

## Features

- Markdown formatting
- HTML tags
- Plain text

## Code Sample

\`\`\`javascript
function hello() {
console.log("Hello, world!");
}
\`\`\`

## Links

[Example link](https://example.com)
`);

  // Create sample HTML file
  await fs.writeFile(htmlFile, `<!DOCTYPE html>
<html>
<head>
<title>Sample HTML Document</title>
</head>
<body>
<h1>Sample HTML Document</h1>
<h2>Introduction</h2>
<p>This is a <strong>sample</strong> document to test <em>format conversion</em> and truncation.</p>
<h2>Features</h2>
<ul>
<li>Markdown formatting</li>
<li>HTML tags</li>
<li>Plain text</li>
</ul>
<h2>Code Sample</h2>
<pre><code>function hello() {
console.log("Hello, world!");
}</code></pre>
<h2>Links</h2>
<p><a href="https://example.com">Example link</a></p>
</body>
</html>
`);

  // Create sample text file
  await fs.writeFile(textFile, `SAMPLE TEXT DOCUMENT
INTRODUCTION
This is a sample document to test format conversion and truncation.
FEATURES
* Markdown formatting
* HTML tags
* Plain text
CODE SAMPLE
function hello() {
console.log("Hello, world!");
}
LINKS
Example link: https://example.com
`);
});
// Clean up test files
/**
 * Removes the fixture directory once the whole suite has run.
 * Cleanup is best-effort: a failure is reported on stderr and does not
 * propagate out of the hook.
 */
afterAll(async () => {
  const removal = fs.rm(testFilesDir, { recursive: true });
  await removal.catch((error) => {
    console.error('Failed to clean up test files:', error);
  });
});
describe('Format conversion with truncation', () => {
// Verifies that detectFormat classifies each generated fixture by its content.
test('correctly detects file formats', async () => {
  // Fixture path paired with the format label detectFormat should report.
  const fixtures = [
    [markdownFile, 'markdown'],
    [htmlFile, 'html'],
    [textFile, 'text'],
  ];

  // Read every fixture first (in order), then run the assertions.
  const contents = [];
  for (const [fixturePath] of fixtures) {
    contents.push(await fs.readFile(fixturePath, 'utf8'));
  }

  fixtures.forEach(([, expectedFormat], index) => {
    expect(detectFormat(contents[index])).toBe(expectedFormat);
  });
});
// Converts the markdown fixture to plain text with a 20-token budget and
// 'end' truncation, then checks what was written to stdout (no `output` path
// is passed, and the test expects the result on stdout in that case).
test('converts markdown to text and then truncates from end', async () => {
  // Capture stdout with a plain function rather than jest.fn(): this file is a
  // native ES module and does not import `jest` from '@jest/globals', so the
  // `jest` identifier is not in scope here. Returning true mirrors the usual
  // return value of stream.write for callers that check it.
  const writtenChunks = [];
  const originalStdoutWrite = process.stdout.write;
  process.stdout.write = (chunk) => {
    writtenChunks.push(chunk);
    return true;
  };

  try {
    await processText({
      input: markdownFile,
      format: 'text',
      maxTokens: 20,
      truncate: 'end'
    });
    const output = writtenChunks.join('');

    // Should contain start content but not end content
    expect(output).toContain('SAMPLE MARKDOWN DOCUMENT');
    expect(output).toContain('INTRODUCTION');
    expect(output).not.toContain('LINKS');
    expect(output).not.toContain('Example link');

    // Should contain truncation indicator
    expect(output).toContain('[...Content truncated from end...]');

    // Check token count is approximately what we expect
    expect(estimateTokenCount(output)).toBeLessThanOrEqual(25); // Allow small buffer for truncation message
  } finally {
    // Always restore the real writer, even when an assertion above throws.
    process.stdout.write = originalStdoutWrite;
  }
});
// Converts the HTML fixture to markdown with a 25-token budget and 'start'
// truncation, writing to outputFile, then inspects the written file.
test('converts HTML to markdown and then truncates from start', async () => {
  const options = {
    input: htmlFile,
    format: 'markdown',
    maxTokens: 25,
    truncate: 'start',
    output: outputFile
  };
  await processText(options);

  const written = await fs.readFile(outputFile, 'utf8');

  // Leading content must have been cut away...
  const droppedSnippets = ['# Sample HTML Document', '## Introduction'];
  for (const snippet of droppedSnippets) {
    expect(written).not.toContain(snippet);
  }

  // ...while trailing content and the truncation marker remain.
  const keptSnippets = [
    '## Links',
    '[Example link](https://example.com)',
    '[...Content truncated from beginning...]'
  ];
  for (const snippet of keptSnippets) {
    expect(written).toContain(snippet);
  }

  // Budget of 25 plus a small allowance for the truncation message.
  const tokenCount = estimateTokenCount(written);
  expect(tokenCount).toBeLessThanOrEqual(30);
});
// Converts the plain-text fixture to HTML with a 30-token budget and 'middle'
// truncation, then checks what was written to stdout (no `output` path is
// passed, and the test expects the result on stdout in that case).
test('converts text to HTML and then truncates from middle', async () => {
  // Capture stdout with a plain function rather than jest.fn(): this file is a
  // native ES module and does not import `jest` from '@jest/globals', so the
  // `jest` identifier is not in scope here. Returning true mirrors the usual
  // return value of stream.write for callers that check it.
  const writtenChunks = [];
  const originalStdoutWrite = process.stdout.write;
  process.stdout.write = (chunk) => {
    writtenChunks.push(chunk);
    return true;
  };

  try {
    await processText({
      input: textFile,
      format: 'html',
      maxTokens: 30,
      truncate: 'middle'
    });
    const output = writtenChunks.join('');

    // Should contain both start and end content
    expect(output).toContain('<pre>SAMPLE TEXT DOCUMENT');
    expect(output).toContain('LINKS');
    expect(output).toContain('Example link');

    // Should contain truncation indicator
    expect(output).toContain('[...Content truncated from middle...]');

    // Should not contain middle content
    expect(output).not.toContain('CODE SAMPLE');

    // Check token count is approximately what we expect
    expect(estimateTokenCount(output)).toBeLessThanOrEqual(35); // Allow small buffer for truncation message
  } finally {
    // Always restore the real writer, even when an assertion above throws.
    process.stdout.write = originalStdoutWrite;
  }
});
// Compares the two-step pipeline (convertFormat, then truncateText) with the
// single processText call using the same format, budget and strategy. The two
// results only need to agree approximately: token counts within 5 of each
// other, and both carrying the end-truncation marker.
test('handles format conversion then truncation in a single command', async () => {
  // First perform conversion and truncation separately
  const mdContent = await fs.readFile(markdownFile, 'utf8');
  const converted = await convertFormat(mdContent, 'html');
  const truncated = truncateText(converted, 25, 'end');

  // Then perform the same operation using processText.
  // Capture stdout with a plain function rather than jest.fn(): this file is a
  // native ES module and does not import `jest` from '@jest/globals', so the
  // `jest` identifier is not in scope here. Returning true mirrors the usual
  // return value of stream.write for callers that check it.
  const writtenChunks = [];
  const originalStdoutWrite = process.stdout.write;
  process.stdout.write = (chunk) => {
    writtenChunks.push(chunk);
    return true;
  };

  try {
    await processText({
      input: markdownFile,
      format: 'html',
      maxTokens: 25,
      truncate: 'end'
    });
    const output = writtenChunks.join('');

    // Both should have similar token counts
    const separateCount = estimateTokenCount(truncated);
    const combinedCount = estimateTokenCount(output);
    expect(Math.abs(separateCount - combinedCount)).toBeLessThanOrEqual(5);

    // Both should have truncation indicators
    expect(truncated).toContain('[...Content truncated from end...]');
    expect(output).toContain('[...Content truncated from end...]');
  } finally {
    // Always restore the real writer, even when an assertion above throws.
    process.stdout.write = originalStdoutWrite;
  }
});
// An unknown target format must make processText reject, even though the
// truncation settings passed alongside it are valid.
test('handles invalid format with valid truncation', async () => {
  const options = {
    input: markdownFile,
    format: 'invalid-format',
    maxTokens: 20,
    truncate: 'end'
  };

  const pending = processText(options);

  await expect(pending).rejects.toThrow('Unsupported conversion');
});
// An unknown truncation strategy must make processText reject, even though
// the target format passed alongside it is valid.
test('handles valid format with invalid truncation', async () => {
  const options = {
    input: markdownFile,
    format: 'text',
    maxTokens: 20,
    truncate: 'invalid-strategy'
  };

  const pending = processText(options);

  await expect(pending).rejects.toThrow('Invalid truncation strategy');
});
// Converts the HTML fixture to markdown with a 40-token budget and 'end'
// truncation, then checks that the file written to outputFile still looks
// like markdown: a heading line, the bold span, and a 'markdown' verdict from
// detectFormat.
test('preserves formatting structure after truncation', async () => {
  await processText({
    input: htmlFile,
    format: 'markdown',
    maxTokens: 40,
    truncate: 'end',
    output: outputFile
  });
  const outputContent = await fs.readFile(outputFile, 'utf8');

  // Check if headings are still formatted correctly: at least one line must
  // start with an ATX heading marker (1-6 '#' followed by a space and text).
  // The `m` flag lets `^` match at the start of any line, not just the file.
  expect(outputContent).toMatch(/^#{1,6} \S/m);

  // Check if bold formatting is preserved
  expect(outputContent).toMatch(/\*\*sample\*\*/);

  // Verify markdown structure is intact
  expect(detectFormat(outputContent)).toBe('markdown');
});
});