echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
29 lines • 1.17 kB
JavaScript
import { splitToParagraphs, wordCharacterRegExp } from '../nlp/Segmentation.js';
import { Logger } from './Logger.js';
/**
 * Fetches a Wikipedia article and flattens it into an ordered list of text
 * fragments: each section's title followed by that section's paragraphs.
 *
 * Titles and paragraphs that do not match `wordCharacterRegExp` are skipped.
 *
 * @param articleName - Article identifier, passed as the first argument to
 *   `wtf.fetch` (presumably the article title; confirm against callers).
 * @param language - Passed as the second argument to `wtf.fetch`
 *   (presumably a Wikipedia language code such as 'en'; confirm against callers).
 * @returns {Promise<string[]>} Section titles and paragraphs, in document order.
 * @throws {Error} If `wtf.fetch` resolves to a falsy value. Rejections from
 *   the dynamic import or from `wtf.fetch` itself propagate unchanged.
 */
export async function parseWikipediaArticle(articleName, language) {
    const logger = new Logger();
    await logger.startAsync('Fetching Wikipedia article');

    // Everything after the logger has been started runs inside try/finally so
    // that logger.end() is called on failure paths as well as on success.
    try {
        // Loaded lazily so the dependency is only resolved when this function is used.
        const { default: wtf } = await import('wtf_wikipedia');
        const document = await wtf.fetch(articleName, language);

        if (!document) {
            throw new Error('Error fetching Wikipedia article');
        }

        const sectionsText = [];

        for (const section of document.sections()) {
            const sectionTitle = section.title();

            // NOTE(review): RegExp.prototype.test is stateful for patterns with the
            // g or y flag; this assumes wordCharacterRegExp has neither. Confirm.
            if (wordCharacterRegExp.test(sectionTitle)) {
                sectionsText.push(sectionTitle);
            }

            const sectionParagraphs = splitToParagraphs(section.text(), 'single', 'preserve');

            for (const paragraph of sectionParagraphs) {
                if (wordCharacterRegExp.test(paragraph)) {
                    sectionsText.push(paragraph);
                }
            }
        }

        return sectionsText;
    } finally {
        logger.end();
    }
}
//# sourceMappingURL=WikipediaReader.js.map