echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

github.com/echogarden-project/echogarden

echogarden-project/echogarden

42 lines (28 loc) • 1.02 kB

text/typescript

View Raw

import { splitToParagraphs, wordCharacterRegExp } from '../nlp/Segmentation.js'
import { Logger } from './Logger.js'

/**
 * Fetches a Wikipedia article and flattens it into an ordered list of text
 * fragments: for every section, its title followed by its paragraphs.
 *
 * Only fragments that match `wordCharacterRegExp` are kept (presumably this
 * filters out fragments containing no word characters — confirm against its
 * definition in `Segmentation.js`).
 *
 * @param articleName - Title of the article, forwarded to `wtf_wikipedia`'s `fetch`.
 * @param language - Forwarded to `wtf_wikipedia`'s `fetch` as its second argument.
 * @returns The retained section titles and paragraphs, in document order.
 * @throws Error if `fetch` resolves to a falsy value.
 */
export async function parseWikipediaArticle(articleName: string, language: string) {
	const progressLogger = new Logger()
	await progressLogger.startAsync('Fetching Wikipedia article')

	// The parser is loaded with a dynamic import, only when this function is called.
	const wtf = (await import('wtf_wikipedia')).default

	const article = await wtf.fetch(articleName, language)

	// NOTE(review): when this throws (or when fetch rejects), `end()` is never
	// reached on the logger — confirm that is acceptable for Logger's state.
	if (!article) {
		throw new Error('Error fetching Wikipedia article')
	}

	const fragments: string[] = []

	for (const section of article.sections()) {
		// The section title comes first, when it passes the filter.
		const heading = section.title()

		if (wordCharacterRegExp.test(heading)) {
			fragments.push(heading)
		}

		// Then the section body, split into paragraphs and filtered the same way.
		for (const paragraph of splitToParagraphs(section.text(), 'single', 'preserve')) {
			if (!wordCharacterRegExp.test(paragraph)) {
				continue
			}

			fragments.push(paragraph)
		}
	}

	progressLogger.end()

	return fragments
}