echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
42 lines (28 loc) • 1.02 kB
text/typescript
import { splitToParagraphs, wordCharacterRegExp } from '../nlp/Segmentation.js'
import { Logger } from './Logger.js'
/**
 * Fetches a Wikipedia article and flattens it into an ordered list of text segments.
 *
 * For every section of the fetched document, the section title is emitted first
 * (only if it matches `wordCharacterRegExp`), followed by each paragraph of the
 * section's text that matches `wordCharacterRegExp`. Titles and paragraphs that
 * do not match are skipped.
 *
 * @param articleName - Article identifier, passed unchanged to `wtf.fetch`.
 * @param language - Language argument, passed unchanged to `wtf.fetch`.
 * @returns Section titles and paragraphs, in document order.
 * @throws Error when `wtf.fetch` resolves to a falsy value. Rejections from
 * `wtf.fetch` itself propagate unchanged.
 */
export async function parseWikipediaArticle(articleName: string, language: string): Promise<string[]> {
	const logger = new Logger()

	await logger.startAsync('Fetching Wikipedia article')

	try {
		// Loaded lazily so the dependency is only resolved when this function is actually called.
		const { default: wtf } = await import('wtf_wikipedia')

		const document = await wtf.fetch(articleName, language)

		if (!document) {
			throw new Error('Error fetching Wikipedia article')
		}

		const sectionsText: string[] = []

		for (const section of document.sections()) {
			const sectionTitle = section.title()

			if (wordCharacterRegExp.test(sectionTitle)) {
				sectionsText.push(sectionTitle)
			}

			const sectionParagraphs = splitToParagraphs(section.text(), 'single', 'preserve')

			for (const paragraph of sectionParagraphs) {
				if (wordCharacterRegExp.test(paragraph)) {
					sectionsText.push(paragraph)
				}
			}
		}

		return sectionsText
	} finally {
		// Always pair the `startAsync` above with `end`, including when the fetch
		// rejects or the article is missing, so the logger is never left started.
		logger.end()
	}
}