@aidarkezio/main-func
Version:
📦 It has my funcs
104 lines (97 loc) • 3.36 kB
text/typescript
import cheerio from 'cheerio'
import got from 'got'
import { ScraperError } from '../utils.js'
import { Wikipedia, WikipediaArgsSchema, WikipediaSchema } from './types.js'
export default async function wikipedia (
query: string,
lang: 'en' | 'id' = 'id'
): Promise<Wikipedia> {
WikipediaArgsSchema.parse(arguments)
const html = await getHtml(lang, query)
const $ = cheerio.load(html)
const title = $('#firstHeading > i').text().trim()
const img = getImgLink($('td.infobox-image > a.image > img[src]').attr('src'))
const articles: string[] = []
let end = false
let start = false
$('#mw-content-text > div.mw-parser-output')
.children()
.map(function () {
if (/p|h[2-4]|div/.test(this.name) && !end) {
let text = ''
const h = /h[2-4]/.test(this.name)
const div = /div/.test(this.name)
const el = $(this)
if (
h &&
/referen|Примечания|Notes_et_références/i.test(
el.find('span.mw-headline').attr('id') as string
)
) { return (end = true) }
const math = $(this).find('span.mwe-math-element')
if (math.length) {
math.replaceWith(
$(
`<span>${math
.text()
.trim()
.replace(/(.*displaystyle.*|\\n)/, '')}</span>`
)
)
}
if (
div &&
el.hasClass('thumb') &&
el.find('div.thumbinner > a > img[src]').length
) {
text = getImgLink(
el.find('div.thumbinner > a > img[src]').attr('src')
)
} else if (div && el.find('div > ol > li[id]').length) {
el.find('div > ol > li[id]').each(function () {
text += $(this).text().trim() + '\n'
})
} else text = el.text().trim()
if (!start && this.name === 'p' && !end && text) start = true
if (text && start && !el.find('div > ul > li').length) { articles.push((h ? '\n' : '') + text) }
}
return true
})
const res = {
title,
img,
articles: articles.join('\n\n')
}
return WikipediaSchema.parse(res)
}
function isSupportLang (lang: string): boolean {
return ['en', 'id'].includes(lang)
}
async function getHtml (lang: string, query: string): Promise<string> {
query = encodeURIComponent(query.trim())
const defaultLink = `https://${isSupportLang(lang) ? lang : 'id'
}.wikipedia.org`
let res = await got(defaultLink + '/wiki/' + query)
if (!(res.statusCode === 404)) return res.body
const link = `${defaultLink}/w/index.php?${lang === 'id'
? `title=Istimewa:Pencarian&search=${query}&fulltext=1&ns0=1`
: `search=${query}&title=Special:Search&profile=advanced&fulltext=1&ns0=1`
}`
res = await got(link)
const html = res.body
const $ = cheerio.load(html)
const results: string[] = []
$('ul.mw-search-results > li.mw-search-result').each(function () {
const link = $(this)
.find('div.mw-search-result-heading > a[href]')
.attr('href')
?.trim()
if (link) results.push(encodeURI(link))
})
if (results[0]) return (await got(defaultLink + results[0])).body
throw new ScraperError('404 Not Found!!')
}
function getImgLink (link: string = ''): string {
if (!/https:/i.test(link)) link = encodeURI('https:' + link)
return link
}