UNPKG

echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

29 lines 1.17 kB
import { splitToParagraphs, wordCharacterRegExp } from '../nlp/Segmentation.js';
import { Logger } from './Logger.js';

/**
 * Fetches a Wikipedia article and flattens it into an ordered list of text
 * segments: each section's title followed by that section's paragraphs.
 *
 * A title or paragraph is kept only when `wordCharacterRegExp` matches it
 * (presumably: it contains at least one word character — confirm against
 * the definition in `Segmentation.js`).
 *
 * @param {string} articleName - Article title, passed as-is to `wtf.fetch`.
 * @param {string} language - Passed as-is as the second argument of `wtf.fetch`.
 * @returns {Promise<string[]>} Retained section titles and paragraphs, in document order.
 * @throws {Error} When `wtf.fetch` resolves to a falsy value.
 */
export async function parseWikipediaArticle(articleName, language) {
    const logger = new Logger();

    await logger.startAsync('Fetching Wikipedia article');

    // Loaded lazily so `wtf_wikipedia` is only imported when an article is actually requested.
    const wtfModule = await import('wtf_wikipedia');
    const wtf = wtfModule.default;

    const article = await wtf.fetch(articleName, language);

    if (!article) {
        throw new Error('Error fetching Wikipedia article');
    }

    const collectedText = [];

    // Appends `text` to the result only when the word-character pattern matches it.
    const pushIfHasWordCharacter = (text) => {
        if (!wordCharacterRegExp.test(text)) {
            return;
        }

        collectedText.push(text);
    };

    for (const section of article.sections()) {
        // The title goes first so it precedes its section's paragraphs in the output.
        pushIfHasWordCharacter(section.title());

        const paragraphs = splitToParagraphs(section.text(), 'single', 'preserve');

        for (const paragraph of paragraphs) {
            pushIfHasWordCharacter(paragraph);
        }
    }

    logger.end();

    return collectedText;
}
//# sourceMappingURL=WikipediaReader.js.map