UNPKG

article-parser

Version:

To extract main article from given URL

128 lines (105 loc) 2.59 kB
// utils -> parseFromHtml import { stripTags, truncate, unique, pipe } from 'bellajs' import { purify, cleanify } from './html.js' import { isValid as isValidUrl, purify as purifyUrl, absolutify as absolutifyUrl, normalize as normalizeUrls, chooseBestUrl, getDomain } from './linker.js' import extractMetaData from './extractMetaData.js' import extractWithReadability, { extractTitleWithReadability } from './extractWithReadability.js' import { execPreParser, execPostParser } from './transformation.js' import getTimeToRead from './getTimeToRead.js' const summarize = (desc, txt, threshold, maxlen) => { return desc.length > threshold ? desc : truncate(txt, maxlen).replace(/\n/g, ' ') } export default async (inputHtml, inputUrl = '', parserOptions = {}) => { const html = purify(inputHtml) const meta = extractMetaData(html) let title = meta.title const { url, shortlink, amphtml, canonical, description: metaDesc, image: metaImg, author, published } = meta const { wordsPerMinute = 300, descriptionTruncateLen = 210, descriptionLengthThreshold = 180, contentLengthThreshold = 200 } = parserOptions // gather title if (!title) { title = extractTitleWithReadability(html, inputUrl) } if (!title) { return null } // gather urls to choose the best url later const links = unique( [url, shortlink, amphtml, canonical, inputUrl] .filter(isValidUrl) .map(purifyUrl) ) if (!links.length) { return null } // choose the best url const bestUrl = chooseBestUrl(links, title) const fns = pipe( (input) => { return normalizeUrls(input, bestUrl) }, (input) => { return execPreParser(input, links) }, (input) => { return extractWithReadability(input, bestUrl) }, (input) => { return input ? execPostParser(input, links) : null }, (input) => { return input ? cleanify(input) : null } ) const content = fns(html) if (!content) { return null } const textContent = stripTags(content) if (textContent.length < contentLengthThreshold) { return null } const description = summarize( metaDesc, textContent, descriptionLengthThreshold, descriptionTruncateLen ) const image = metaImg ? absolutifyUrl(bestUrl, metaImg) : '' return { url: bestUrl, title, description, links, image, content, author, source: getDomain(bestUrl), published, ttr: getTimeToRead(textContent, wordsPerMinute) } }