generator-begcode
Version:
Spring Boot + Angular/React/Vue in one handy generator
55 lines (54 loc) • 1.87 kB
JavaScript
import { load } from 'cheerio';
/**
 * Splits the text content of an HTML document into size-bounded chunks.
 *
 * The chunker visits every `div`, `section` and `article` element, extracts
 * its text, normalizes whitespace and cuts any text longer than
 * `config.maxChunkSize` into smaller pieces.
 */
export class HTMLChunker {
  /** Chunking options; this class only reads `maxChunkSize`. */
  config;

  /**
   * @param {{ maxChunkSize?: number }} config - Chunking options.
   *   `maxChunkSize` is the maximum length (in UTF-16 code units) of a chunk.
   */
  constructor(config) {
    this.config = config;
  }

  /**
   * Normalizes whitespace in a piece of text. In order, it:
   *  1. removes tab characters;
   *  2. removes the literal three-character sequence backslash, backslash, `t`;
   *  3. replaces newline characters with a single space;
   *  4. replaces the literal sequence backslash, backslash, `n` with a newline;
   *  5. collapses runs of spaces into one space;
   *  6. trims leading and trailing whitespace.
   *
   * @param {string} html - Text to clean up.
   * @returns {string} The normalized text.
   */
  sanitize(html) {
    return html
      .replaceAll('\t', '')
      .replaceAll('\\\\t', '')
      .replaceAll('\n', ' ')
      .replaceAll('\\\\n', '\n')
      .replace(/ +(?= )/g, '')
      .trim();
  }

  /**
   * Extracts text chunks from the `div`, `section` and `article` elements of
   * an HTML string.
   *
   * Text longer than `config.maxChunkSize` is split, preferring the last `<`
   * character at or before the limit and falling back to a hard cut at the
   * limit. When `maxChunkSize` is missing, not a number, or smaller than 1,
   * text is never split.
   *
   * @param {string} html - HTML markup to parse with cheerio.
   * @returns {string[]} Chunks in the order they were produced. The trailing
   *   piece of each extracted text is skipped when an identical chunk has
   *   already been emitted.
   */
  chunk(html) {
    const $ = load(html);
    const chunks = [];
    // Mirrors `chunks` so the duplicate check is O(1) instead of a linear scan.
    const seen = new Set();
    // NaN (missing / non-numeric limit) fails the `>= 1` test below, which
    // disables splitting instead of looping forever.
    const limit = Math.floor(Number(this.config?.maxChunkSize));

    const pushChunk = (text) => {
      chunks.push(text);
      seen.add(text);
    };

    const extractContent = (element) => {
      let chunk = this.sanitize($(element).text().trim());

      while (limit >= 1 && chunk.length > limit) {
        let indexToSplit = chunk.lastIndexOf('<', limit);
        // -1 means no '<' was found; 0 means the only candidate is the very
        // first character. Splitting at 0 would yield an empty piece and leave
        // `chunk` unchanged (an endless loop), so hard-cut at the limit.
        if (indexToSplit <= 0) {
          indexToSplit = limit;
        }

        const subChunk = chunk.substring(0, indexToSplit).trim();
        if (subChunk) {
          pushChunk(subChunk);
        }
        chunk = this.sanitize(chunk.substring(indexToSplit).trim());
      }

      if (chunk && !seen.has(chunk)) {
        pushChunk(chunk);
      }
    };

    const processElement = (selector) => {
      $(selector).each((_index, element) => {
        const $element = $(element);

        if ($element.find(selector).length > 0) {
          // The element nests same-tag descendants: extract each direct child
          // of that tag on its own.
          $element.children(selector).each((_childIndex, child) => {
            extractContent(child);
          });
          return;
        }

        // No same-tag descendants: extract this element together with the
        // run of following siblings that match the same selector.
        const siblings = $element.nextUntil(`:not(${selector})`).addBack();
        extractContent(siblings);
      });
    };

    ['div', 'section', 'article'].forEach(processElement);
    return chunks;
  }
}