semantic-chunking
Version:
Semantically create chunks from large texts. Useful for workflows involving large language models (LLMs).
579 lines (493 loc) • 21.5 kB
JavaScript
// set form default values
import defaultFormValues from './default-form-values.js';
/**
 * Populate every form control from `defaultFormValues`, then refresh the
 * pieces of UI that are derived from those values (slider read-outs and the
 * controls that depend on the combine-chunks checkbox).
 */
function setDefaultFormValues() {
  const byId = (id) => document.getElementById(id);

  // Range inputs whose element id doubles as the key in defaultFormValues
  const sliderIds = [
    'maxTokenSize',
    'similarityThreshold',
    'dynamicThresholdLowerBound',
    'dynamicThresholdUpperBound',
    'numSimilaritySentencesLookahead',
    'combineChunksSimilarityThreshold',
  ];
  for (const id of sliderIds) {
    byId(id).value = defaultFormValues[id];
  }

  // Checkboxes, keyed the same way
  const checkboxIds = [
    'combineChunks',
    'returnEmbedding',
    'returnTokenLength',
    'excludeChunkPrefixInResults',
  ];
  for (const id of checkboxIds) {
    byId(id).checked = defaultFormValues[id];
  }

  // Text input: a missing/empty prefix becomes the empty string
  byId('chunkPrefix').value = defaultFormValues.chunkPrefix || '';

  // The dtype control holds a numeric index, while the defaults hold the name
  const dtypeIndexByName = { fp32: 0, fp16: 1, q8: 2, q4: 3 };
  byId('dtype').value = dtypeIndexByName[defaultFormValues.dtype] || 0;

  // Device, defaulting to 'cpu'
  byId('device').value = defaultFormValues.device || 'cpu';

  // Fire a synthetic 'input' event on every slider so its read-out refreshes
  document.querySelectorAll('input[type="range"]').forEach((slider) => {
    slider.dispatchEvent(new Event('input'));
  });

  // Re-evaluate the controls that depend on the combineChunks checkbox
  updateDependentControls();
}
// Call setDefaultFormValues after the DOM is loaded.
// The call is registered as a listener rather than made inline here because
// setDefaultFormValues() ends by calling updateDependentControls(), which
// reads module-level constants that are declared further down in this file.
document.addEventListener('DOMContentLoaded', setDefaultFormValues);
// Load sample text on page load.
// fetch() only rejects on network failure; an HTTP error (404, 500, ...) still
// resolves, so response.ok is checked explicitly. Without the check the
// server's error page would be written into the document textarea as if it
// were the sample text.
fetch('./documents/sample.txt')
  .then((response) => {
    if (!response.ok) {
      throw new Error(`Failed to load ./documents/sample.txt (HTTP ${response.status})`);
    }
    return response.text();
  })
  .then((text) => {
    document.getElementById('documentText').value = text;
  })
  .catch((error) => console.error('Error loading sample text:', error));
// Load and populate model options.
// response.ok is checked so that an HTTP error is reported as such instead of
// surfacing later as a JSON parse error or a missing `models` property.
fetch('models.json')
  .then((response) => {
    if (!response.ok) {
      throw new Error(`Failed to load models.json (HTTP ${response.status})`);
    }
    return response.json();
  })
  .then((data) => {
    const select = document.getElementById('onnxEmbeddingModel');
    // One <option> per entry; each entry supplies `value` and `label`
    data.models.forEach((model) => {
      const option = document.createElement('option');
      option.value = model.value;
      option.textContent = model.label;
      select.appendChild(option);
    });
    // The default can only be selected once its <option> exists
    select.value = defaultFormValues.onnxEmbeddingModel;
  })
  .catch((error) => console.error('Error loading models:', error));
// Keep the read-out next to every range slider in sync with the slider value
document.querySelectorAll('input[type="range"]').forEach((slider) => {
  // The read-out container is the element immediately after the slider
  const readout = slider.nextElementSibling;

  // Make sure the container has its .number and .description spans
  if (!readout.querySelector('.number')) {
    for (const spanClass of ['number', 'description']) {
      const span = document.createElement('span');
      span.className = spanClass;
      readout.appendChild(span);
    }
  }

  // Sliders whose value is a similarity score and gets a coloured label
  const similaritySliderIds = [
    'similarityThreshold',
    'combineChunksSimilarityThreshold',
    'dynamicThresholdLowerBound',
    'dynamicThresholdUpperBound',
  ];

  // Render `rawValue` into the read-out for this slider
  const render = (rawValue) => {
    const numberEl = readout.querySelector('.number');
    const descriptionEl = readout.querySelector('.description');

    // Plain sliders just echo the value, with no description
    if (!similaritySliderIds.includes(slider.id)) {
      numberEl.textContent = rawValue;
      descriptionEl.textContent = '';
      return;
    }

    // Similarity sliders: below 0.5 is low, up to and including 0.7 is
    // moderate, anything else is high
    const similarity = parseFloat(rawValue);
    let band;
    let label;
    if (similarity < 0.5) {
      [band, label] = ['similarity-low', 'low similarity'];
    } else if (similarity <= 0.7) {
      [band, label] = ['similarity-moderate', 'moderately similar'];
    } else {
      [band, label] = ['similarity-high', 'very similar'];
    }

    numberEl.className = `number ${band}`;
    numberEl.textContent = similarity.toFixed(3);
    descriptionEl.className = `description ${band}`;
    descriptionEl.textContent = label;
  };

  // Paint once now, then again on every 'input' event
  render(slider.value);
  slider.addEventListener('input', (evt) => render(evt.target.value));
});
// The combine-chunks checkbox, and every element marked with the
// "depends-on-combine-chunks" class. Both are looked up once at module load
// and read by updateDependentControls().
const combineChunksToggle = document.getElementById('combineChunks');
const dependentControls = document.querySelectorAll('.depends-on-combine-chunks');
/**
 * Show or hide every control that depends on the combine-chunks checkbox.
 *
 * Enabled:  the control gets `display: block` immediately; shortly afterwards
 *           its `hidden` class is removed and its inputs are enabled.
 * Disabled: the `hidden` class is added and the inputs are disabled at once;
 *           `display: none` is applied when the next `transitionend` event
 *           reaches the control.
 *
 * Both deferred steps re-read the checkbox before acting, so a stale timer or
 * a late `transitionend` can never leave a control in a state that
 * contradicts the checkbox.
 */
function updateDependentControls() {
  const isEnabled = combineChunksToggle.checked;

  dependentControls.forEach((control) => {
    if (isEnabled) {
      // Show controls
      control.style.display = 'block';
      // Deferred so that display: block takes effect before the class change
      setTimeout(() => {
        // The checkbox may have been switched off again while this timer was
        // pending; in that case the hide path has already run and must win.
        if (!combineChunksToggle.checked) {
          return;
        }
        control.classList.remove('hidden');
        control.querySelectorAll('input').forEach((input) => {
          input.disabled = false;
        });
      }, 10);
      return;
    }

    // Hide controls
    control.classList.add('hidden');
    control.querySelectorAll('input').forEach((input) => {
      input.disabled = true;
    });

    // Apply display: none on the next transitionend. `once` removes the
    // listener after its first invocation.
    control.addEventListener(
      'transitionend',
      () => {
        if (!combineChunksToggle.checked) {
          control.style.display = 'none';
        }
      },
      { once: true }
    );
  });
}
// Initial state: apply the checkbox's current value once at module load
updateDependentControls();
// Listen for changes: re-apply whenever the checkbox fires 'change'
combineChunksToggle.addEventListener('change', updateDependentControls);
// Elements used by the form submission flow
const form = document.getElementById('chunkForm');
const resultsContainer = document.getElementById('results');
const resultsJson = document.getElementById('resultsJson');
const downloadButton = document.getElementById('downloadButton');
const resultsFooter = document.querySelector('.results-footer');

// Start from a clean slate: footer hidden, results pane empty
resultsFooter.classList.remove('visible');
resultsJson.textContent = '';

// Loading spinner, inserted as the sibling immediately before the results pane
const spinner = document.createElement('div');
spinner.className = 'spinner';
const resultsParent = resultsJson.parentNode;
resultsParent.insertBefore(spinner, resultsJson);
/**
 * Smoothly scroll the `.results-wrapper` element into view when the window
 * is 800px wide or narrower. Does nothing on wider windows or when the
 * wrapper element is not found.
 */
function scrollToResults() {
  const isNarrowViewport = window.innerWidth <= 800;
  if (!isNarrowViewport) {
    return;
  }

  const wrapper = document.querySelector('.results-wrapper');
  if (!wrapper) {
    return;
  }

  wrapper.scrollIntoView({ behavior: 'smooth' });
}
// Process form handler

/**
 * Escape the characters that are significant in HTML so that `text` can be
 * embedded in markup and still be rendered literally.
 *
 * @param {*} text - Value to escape; it is converted with String() first.
 * @returns {string} The escaped text.
 */
function escapeHtml(text) {
  return String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/**
 * Build the request payload from the form's current state.
 *
 * Checkbox entries are converted to booleans. FormData leaves unchecked
 * checkboxes out altogether, so every enabled, named checkbox that is missing
 * from the entries is added explicitly as `false`; otherwise "unchecked"
 * would never reach the server. Disabled controls stay omitted, as FormData
 * omits them.
 *
 * @param {HTMLFormElement} formElement - The form to read.
 * @returns {Object} Plain object keyed by control name.
 */
function collectChunkRequestData(formElement) {
  const entries = Array.from(new FormData(formElement).entries()).map(([key, value]) => {
    const control = formElement.elements[key];
    // Convert checkbox values to boolean
    if (control && control.type === 'checkbox') {
      return [key, control.checked];
    }
    return [key, value];
  });
  const data = Object.fromEntries(entries);

  for (const control of formElement.elements) {
    const isMissingCheckbox =
      control.type === 'checkbox' &&
      Boolean(control.name) &&
      !control.disabled &&
      !(control.name in data);
    if (isMissingCheckbox) {
      data[control.name] = false;
    }
  }
  return data;
}

// Submit flow: show the spinner, POST the form as JSON to /api/chunk, render
// the (possibly truncated) JSON result plus summary stats, and wire up the
// download button. Any failure is reported as a toast and as a JSON error
// object in the results pane.
form.addEventListener('submit', async (e) => {
  e.preventDefault();
  const submitButton = form.querySelector('button[type="submit"]');
  const downloadButton = document.getElementById('downloadButton');
  const defaultMessage = document.getElementById('defaultMessage');

  try {
    submitButton.disabled = true;
    downloadButton.disabled = true;
    spinner.classList.add('visible');
    resultsJson.style.display = 'none';
    defaultMessage.style.display = 'none';

    // Scroll to results as soon as we show the spinner
    scrollToResults();

    const data = collectChunkRequestData(form);

    const startTime = performance.now();
    const response = await fetch('/api/chunk', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(data)
    });

    if (!response.ok) {
      let details;
      try {
        details = (await response.json()).details;
      } catch (parseError) {
        // The error body is not JSON; the generic message below is used
        // rather than reporting an unrelated parse error.
      }
      throw new Error(details || 'Failed to process text');
    }

    // Display results
    const result = await response.json();
    const endTime = performance.now();
    const processingTime = ((endTime - startTime) / 1000).toFixed(2);

    spinner.classList.remove('visible');
    defaultMessage.style.display = 'none';
    resultsJson.style.display = 'block';

    // Create code element and set content safely
    const codeElement = document.createElement('code');
    codeElement.className = 'language-json';

    // Format the JSON string
    const formattedJson = JSON.stringify(result, null, 2);
    const lines = formattedJson.split('\n');

    // Store full result for download
    resultsJson.dataset.fullResult = formattedJson;

    // Truncate if more than 1000 lines
    if (lines.length > 1000) {
      const truncatedLines = [
        ...lines.slice(0, 1000),
        '\n',
        '// ...',
        '// ...',
        '// ⚠️🚨 Notice: Data Truncated for Display 🚨⚠️',
        '// The full result is too large to display here.',
        '// Please use the download button to get the entire result.',
        '// ...',
        '// ...',
      ];
      codeElement.textContent = truncatedLines.join('\n');
    } else {
      codeElement.textContent = formattedJson;
    }

    resultsJson.textContent = ''; // Clear existing content
    resultsJson.appendChild(codeElement);

    // Highlighting is skipped when embeddings were requested
    if (!data.returnEmbedding) {
      hljs.highlightElement(codeElement);
    }

    // Enable download button if we have results
    downloadButton.disabled = false;

    // Calculate and display stats
    const chunkCountEl = document.getElementById('chunkCount');
    const avgTokenLengthEl = document.getElementById('avgTokenLength');
    const processingTimeEl = document.getElementById('processingTime');
    if (result.length > 0) {
      const numChunks = result[0].number_of_chunks;
      chunkCountEl.textContent = `Chunks: ${numChunks}`;

      if (result[0].token_length !== undefined) {
        const avgTokens = Math.round(
          result.reduce((sum, chunk) => sum + chunk.token_length, 0) / result.length
        );
        avgTokenLengthEl.textContent = `Avg Tokens: ${avgTokens}`;
      } else {
        avgTokenLengthEl.textContent = '';
      }

      processingTimeEl.textContent = `Total Time: ${processingTime}s`;
    } else {
      chunkCountEl.textContent = '';
      avgTokenLengthEl.textContent = '';
      processingTimeEl.textContent = '';
    }

    // Enable download: serve the full (untruncated) JSON through a temporary
    // object URL and a throwaway <a download> element
    downloadButton.onclick = () => {
      const fullData = resultsJson.dataset.fullResult;
      const blob = new Blob([fullData], { type: 'application/json' });
      const url = URL.createObjectURL(blob);
      const a = document.createElement('a');
      a.href = url;
      a.download = 'chunks.json';
      document.body.appendChild(a);
      a.click();
      document.body.removeChild(a);
      URL.revokeObjectURL(url);
    };

    // After results are displayed, scroll to them on mobile
    scrollToResults();
  } catch (error) {
    console.error('Error:', error);
    const rawMessage = error instanceof Error ? error.message : String(error);

    // showToast() inserts its argument as HTML (the <br> tags below rely on
    // that). The message can come straight from the server's error response,
    // so it is escaped before any markup is appended.
    let toastHtml = escapeHtml(rawMessage);
    if (rawMessage.includes('Could not locate file:')) {
      toastHtml += '<br><br>Not all models have all precision options available.';
      toastHtml += '<br>Please try a different precision level and/or model and try again.';
    }
    showToast(toastHtml);

    downloadButton.disabled = true;
    spinner.classList.remove('visible');
    resultsJson.style.display = 'block';

    const codeElement = document.createElement('code');
    codeElement.className = 'language-json';
    codeElement.textContent = JSON.stringify({ error: rawMessage }, null, 2);
    resultsJson.textContent = ''; // Clear existing content
    resultsJson.appendChild(codeElement);
    hljs.highlightElement(codeElement);
  } finally {
    submitButton.disabled = false;
    submitButton.textContent = 'Process Text';
  }
});
// Initialize download button as disabled; the submit handler enables it once
// a result has been rendered and disables it again when a request fails.
document.getElementById('downloadButton').disabled = true;
// Document buttons: clicking one fetches ./documents/<data-file>.txt and
// places its contents and a matching name into the form
for (const button of document.querySelectorAll('.document-buttons button')) {
  button.addEventListener('click', async () => {
    const { file: fileType } = button.dataset;
    const fileName = `./documents/${fileType}.txt`;

    try {
      const response = await fetch(fileName);
      if (!response.ok) {
        throw new Error(`Failed to load ${fileName}`);
      }

      const text = await response.text();
      const textField = document.getElementById('documentText');
      textField.value = text;
      const nameField = document.getElementById('documentName');
      nameField.value = `${fileType} text`;
    } catch (error) {
      console.error('Error loading text file:', error);
      showToast(`Failed to load ${fileName}`);
    }
  });
}
// Code-example modal and its controls
const modal = document.getElementById('codeModal');
const getCodeBtn = document.getElementById('getCodeButton');
const closeBtn = document.querySelector('.close');
const copyBtn = document.getElementById('copyCode');
const codeExample = document.querySelector('#codeExample code');

// Get Code button handler: snapshot the form, render the generated snippet
// into the modal, open the modal and highlight the snippet
getCodeBtn.onclick = () => {
  // Checkboxes contribute their checked state (always); every other control
  // contributes its value, but only when it has a name
  const formData = {};
  for (const control of form.elements) {
    const isCheckbox = control.type === 'checkbox';
    if (!isCheckbox && !control.name) {
      continue;
    }
    formData[control.name] = isCheckbox ? control.checked : control.value;
  }

  codeExample.textContent = generateCode(formData);

  modal.style.display = "block";
  document.body.style.overflow = 'hidden'; // Prevent body scrolling

  // Drop the marker left by a previous highlight run, then highlight again
  delete codeExample.dataset.highlighted;
  hljs.highlightElement(codeExample);
};
/**
 * Generate a ready-to-paste `semantic-chunking` usage example that mirrors
 * the current form settings.
 *
 * String-valued settings are emitted through JSON.stringify so that quotes,
 * backslashes and line breaks inside them (for example in the document name
 * or chunk prefix) produce a valid string literal instead of broken code.
 *
 * @param {Object} formData - Form values keyed by control name. `dtype` is
 *   the slider index (0-3) into fp32/fp16/q8/q4; an index outside that range
 *   falls back to "fp32".
 * @returns {string} The source code of the example.
 */
function generateCode(formData) {
  const dtypeValues = ['fp32', 'fp16', 'q8', 'q4'];
  const dtypeIndex = Number.parseInt(formData.dtype, 10);
  const dtype = dtypeValues[dtypeIndex] !== undefined ? dtypeValues[dtypeIndex] : 'fp32';

  // Render a value as a double-quoted, properly escaped string literal
  const quote = (value) => JSON.stringify(String(value));

  return `// import the semantic-chunking library
import { chunkit } from 'semantic-chunking';
// define the documents array to be chunked
const documents = [
{
document_name: ${quote(formData.documentName)},
document_text: "Document text goes here.",
}
];
// call the chunkit function with the documents array and an options object
const myChunks = await chunkit(
documents,
{
logging: ${formData.logging},
maxTokenSize: ${formData.maxTokenSize},
similarityThreshold: ${formData.similarityThreshold},
dynamicThresholdLowerBound: ${formData.dynamicThresholdLowerBound},
dynamicThresholdUpperBound: ${formData.dynamicThresholdUpperBound},
numSimilaritySentencesLookahead: ${formData.numSimilaritySentencesLookahead},
combineChunks: ${formData.combineChunks},
combineChunksSimilarityThreshold: ${formData.combineChunksSimilarityThreshold},
onnxEmbeddingModel: ${quote(formData.onnxEmbeddingModel)},
dtype: ${quote(dtype)},
device: ${quote(formData.device)},
localModelPath: "./models",
modelCacheDir: "./models",
returnEmbedding: ${formData.returnEmbedding},
returnTokenLength: ${formData.returnTokenLength},
chunkPrefix: ${quote(formData.chunkPrefix)},
excludeChunkPrefixInResults: ${formData.excludeChunkPrefixInResults},
}
);
// log the results
console.log(myChunks);`;
}
// Hide the code modal and restore body scrolling
const hideCodeModal = () => {
  modal.style.display = "none";
  document.body.style.overflow = '';
};

// Close Modal button handler
closeBtn.onclick = () => hideCodeModal();

// Close Modal on click outside: only a click whose target is the modal
// element itself closes it
window.onclick = ({ target }) => {
  if (target !== modal) {
    return;
  }
  hideCodeModal();
};
// Copy Code button handler.
// The write is awaited inside try/catch so that every failure mode ends in
// the same toast: a rejected writeText() (e.g. permission denied) as well as
// the synchronous TypeError raised when navigator.clipboard is undefined,
// which is the case outside secure contexts.
copyBtn.onclick = async () => {
  try {
    await navigator.clipboard.writeText(codeExample.textContent);
  } catch (err) {
    console.error('Failed to copy:', err);
    showToast('Failed to copy code to clipboard');
    return;
  }

  // Success feedback lives outside the try block so that a problem while
  // showing it is never misreported as a copy failure
  copyBtn.textContent = "Copied!";
  showToast('Code copied to clipboard!', 'success');
  // Restore the button label after two seconds
  setTimeout(() => {
    copyBtn.textContent = "Copy Code";
  }, 2000);
};
// Close Modal button handler for the #closeModal element: hides the modal
// and lets the page body scroll again
const closeModalBtn = document.getElementById('closeModal');
closeModalBtn.onclick = function closeCodeModal() {
  document.body.style.overflow = ''; // Restore body scrolling
  modal.style.display = "none";
};
/**
 * Show a toast inside #toastContainer, replacing any toast already shown.
 *
 * The toast is dismissed by clicking it, by clicking the container, or
 * automatically after `duration` milliseconds. Dismissal is tied to this
 * particular toast: once a newer toast has replaced it, its pending timers
 * do nothing, so an old toast can never take a newer one down with it.
 *
 * @param {string} message - Toast content. NOTE: inserted as HTML (callers in
 *   this file pass markup such as <br>), so callers must escape any untrusted
 *   text before passing it in.
 * @param {string} [type='error'] - Added as a CSS class next to `toast`.
 * @param {number} [duration=7000] - Milliseconds before auto-dismissal.
 */
function showToast(message, type = 'error', duration = 7000) {
  const toastContainer = document.getElementById('toastContainer');
  const toast = document.createElement('div');
  toast.className = `toast ${type}`;
  toast.innerHTML = message;

  // Clear any existing toasts (this detaches the previous toast element)
  toastContainer.innerHTML = '';
  toastContainer.classList.add('visible');
  toastContainer.appendChild(toast);

  let autoDismissId = null;
  let isDismissing = false;

  // True while this toast is still the one shown in the container
  const isCurrentToast = () => toast.parentNode === toastContainer;

  // Function to dismiss toast
  function dismissToast() {
    // A manual dismissal makes the auto-dismiss timer redundant
    clearTimeout(autoDismissId);
    if (isDismissing || !isCurrentToast()) {
      return;
    }
    isDismissing = true;

    toast.style.animation = 'fadeOut 0.3s ease-out forwards';
    // Empty the container once the fade-out has had time to play
    setTimeout(() => {
      // A newer toast may have been shown during the fade; leave it alone
      if (!isCurrentToast()) {
        return;
      }
      toastContainer.classList.remove('visible');
      toastContainer.innerHTML = '';
    }, 300);
  }

  // Add click handlers. Assigning onclick replaces the previous toast's
  // handler instead of stacking another listener on the container.
  toastContainer.onclick = dismissToast;
  toast.onclick = (e) => {
    e.stopPropagation(); // Prevent double-firing with container click
    dismissToast();
  };

  // Auto dismiss after duration
  autoDismissId = setTimeout(dismissToast, duration);
}
// info icon: clicking it shows an 'info' toast explaining where the model
// list comes from
document.querySelector('.info-icon').addEventListener('click', () => {
  const message = 'More model choices can be added by updating the "models.json" file in the "webui" directory.';
  const autoDismissMs = 7000;
  showToast(message, 'info', autoDismissMs);
});
const resultsContent = document.querySelector('.results-content');
const processingTimeSpan = document.getElementById('processingTime');

// Button that toggles wrapping of the results text; it is inserted right
// after the processing-time element
const resizeToggle = document.createElement('button');
resizeToggle.title = "Toggle text wrapping";
resizeToggle.className = 'resize-toggle';
resizeToggle.innerHTML = `
<svg viewBox="0 0 24 24">
<path d="M17 8.5L20 11.5L17 14.5M7 8.5L4 11.5L7 14.5M5.5 11.5H18.5"
stroke="white"
stroke-width="2"
stroke-linecap="round"
stroke-linejoin="round"
fill="none"/>
</svg>
`;
const toggleAnchor = processingTimeSpan.nextSibling;
processingTimeSpan.parentNode.insertBefore(resizeToggle, toggleAnchor);

// Each click flips the 'wrapped' class on the results pane and on the button
resizeToggle.addEventListener('click', () => {
  for (const element of [resultsJson, resizeToggle]) {
    element.classList.toggle('wrapped');
  }
});
// dtype display
// dtypeDisplay is the element immediately following the #dtype input;
// updateDtypeDisplay() writes into its .number and .description children.
const dtypeInput = document.getElementById('dtype');
const dtypeDisplay = dtypeInput.nextElementSibling;
/**
 * Render the dtype read-out for a slider position.
 *
 * Writes the raw value into the `.number` child of `dtypeDisplay` and the
 * human-readable precision label into its `.description` child, and tags
 * both with the matching precision CSS class.
 *
 * @param {string|number} value - Slider position, 0 through 3. Any other
 *   value has no entry in the lookup table and raises a TypeError.
 */
function updateDtypeDisplay(value) {
  const precisionByIndex = {
    0: { text: 'fp32 - Full Precision', class: 'precision-full' },
    1: { text: 'fp16 - Half Precision', class: 'precision-half' },
    2: { text: 'q8 - 8-bit Quantized', class: 'precision-q8' },
    3: { text: 'q4 - 4-bit Quantized', class: 'precision-q4' },
  };
  const { text: label, class: cssClass } = precisionByIndex[value];

  const numberEl = dtypeDisplay.querySelector('.number');
  numberEl.className = `number ${cssClass}`;
  numberEl.textContent = value;

  const descriptionEl = dtypeDisplay.querySelector('.description');
  descriptionEl.className = `description ${cssClass}`;
  descriptionEl.textContent = label;
}
// Initial update: render the read-out for the slider's current value
updateDtypeDisplay(dtypeInput.value);
// Update on change: re-render on every 'input' event from the slider
dtypeInput.addEventListener('input', (e) => updateDtypeDisplay(e.target.value));
// version display.
// response.ok is checked because fetch() resolves even for HTTP errors; an
// error response with a JSON body would otherwise be rendered as "vundefined".
fetch('/version')
  .then((response) => {
    if (!response.ok) {
      throw new Error(`Failed to fetch version (HTTP ${response.status})`);
    }
    return response.json();
  })
  .then((data) => {
    document.getElementById('version').textContent = `v${data.version}`;
  })
  .catch((error) => console.error('Error fetching version:', error));