@forge-ml/rag
Version:
A RAG (Retrieval-Augmented Generation) package for Forge ML
64 lines (63 loc) • 1.79 kB
JavaScript
//helper functions for cleaning text
/**
 * Decode the common HTML entities `&nbsp;`, `&lt;`, `&gt;` and `&amp;`
 * into the plain characters they stand for.
 *
 * Despite the name, nothing is stripped: entities are replaced by their
 * text equivalents, and any other entity is left untouched.
 *
 * @param {string} text - Text that may contain HTML entities.
 * @returns {string} The text with the four entities decoded.
 */
function removeHTML(text) {
  const entityText = { nbsp: " ", lt: "<", gt: ">", amp: "&" };
  return (
    text
      // Decode in a single pass so that "&amp;lt;" becomes "&lt;" and is
      // not decoded a second time into "<".
      .replace(/&(nbsp|lt|gt|amp);/g, (_match, name) => entityText[name])
      // A literal non-breaking space (the character `&nbsp;` denotes) is
      // normalized to a regular space as well.
      .replace(/\u00a0/g, " ")
  );
}
/**
 * Blank out email header lines.
 *
 * A line is treated as a header when it starts with `From:`, `To:`,
 * `Sent:` or `Subject:` (case-sensitive). Only the line's content is
 * removed; its line break stays in place.
 *
 * @param {string} text - Text that may contain email header lines.
 * @returns {string} The text with header lines emptied.
 */
function removeEmailHeaders(text) {
  const headerLine = /^(From|To|Sent|Subject):.*$/gm;
  return text.replace(headerLine, "");
}
/**
 * Delete every `http://` or `https://` URL from the text.
 *
 * A URL runs from the scheme up to the next whitespace character; the
 * whitespace around it is left as it was.
 *
 * @param {string} text - Text that may contain URLs.
 * @returns {string} The text with URLs removed.
 */
function removeURLs(text) {
  const urlPattern = /https?:\/\/\S+/g;
  return text.replace(urlPattern, "");
}
/**
 * Collapse each run of whitespace into a single space.
 *
 * Tabs and line breaks count as whitespace here, so they are replaced
 * by a space too.
 *
 * @param {string} text - Text to normalize.
 * @returns {string} The text with whitespace runs collapsed.
 */
function removeExtraWhitespace(text) {
  const whitespaceRun = /\s+/g;
  return text.replace(whitespaceRun, " ");
}
/**
 * Drop every line that is empty or consists only of whitespace.
 *
 * Lines are delimited by "\n"; the lines that remain are kept verbatim
 * and re-joined with "\n".
 *
 * @param {string} text - Multi-line text.
 * @returns {string} The text without blank lines.
 */
function removeLinesWithOnlyWhitespace(text) {
  const keptLines = [];
  for (const line of text.split("\n")) {
    if (line.trim() !== "") {
      keptLines.push(line);
    }
  }
  return keptLines.join("\n");
}
/**
 * Reduce or remove line breaks.
 *
 * @param {string} text - Text to process.
 * @param {boolean} preserveNewlines - When truthy, runs of three or more
 *   newlines are shortened to two; otherwise every newline becomes a space.
 * @returns {string} The processed text.
 */
function removeNewLines(text, preserveNewlines) {
  if (!preserveNewlines) {
    // Flatten: each newline is swapped for a single space.
    return text.replace(/\n/g, " ");
  }
  // Keep line breaks but cap consecutive newlines at two.
  return text.replace(/\n{3,}/g, "\n\n");
}
/**
 * Strip leading and trailing whitespace.
 *
 * @param {string} text - Text to trim.
 * @returns {string} The trimmed text.
 */
function trimWhitespace(text) {
  const trimmed = text.trim();
  return trimmed;
}
/**
 * Run the full cleaning pipeline over a piece of raw text.
 *
 * Steps, in order: HTML entity handling, email header removal, URL
 * removal, whitespace collapsing, blank-line removal, newline handling,
 * a second blank-line removal, and a final trim.
 *
 * @param {string} text - Raw text to clean.
 * @param {boolean} [preserveNewlines=true] - When true, line breaks are
 *   kept: whitespace is collapsed within each line and blank lines are
 *   dropped, so the remaining lines are separated by single newlines.
 *   When false, the result is a single line with words separated by
 *   single spaces.
 * @returns {string} The cleaned text.
 */
export function cleanText(text, preserveNewlines = true) {
  let cleaned = removeHTML(text);
  cleaned = removeEmailHeaders(cleaned);
  cleaned = removeURLs(cleaned);

  if (preserveNewlines) {
    // Collapse whitespace one line at a time. A whole-text collapse treats
    // "\n" as whitespace and would turn every line break into a space,
    // leaving `preserveNewlines` with no effect.
    cleaned = cleaned
      .split("\n")
      .map((line) => removeExtraWhitespace(line).trim())
      .join("\n");
  } else {
    cleaned = removeExtraWhitespace(cleaned);
  }

  // Removing headers and URLs can leave lines empty; drop them.
  cleaned = removeLinesWithOnlyWhitespace(cleaned);
  cleaned = removeNewLines(cleaned, preserveNewlines);
  cleaned = removeLinesWithOnlyWhitespace(cleaned);
  return trimWhitespace(cleaned);
}