UNPKG

article-parser

Version:

To extract main article from given URL

129 lines (105 loc) 2.6 kB
// utils -> parseFromHtml import { stripTags, truncate, unique, pipe } from 'bellajs' import { cleanify, cleanAndMinify as cleanAndMinifyHtml } from './html.js' import { isValid as isValidUrl, purify as purifyUrl, absolutify as absolutifyUrl, normalize as normalizeUrls, chooseBestUrl, getDomain } from './linker.js' import extractMetaData from './extractMetaData.js' import extractWithReadability, { extractTitleWithReadability } from './extractWithReadability.js' import { execPreParser, execPostParser } from './transformation.js' import getTimeToRead from './getTimeToRead.js' import { getParserOptions } from '../config.js' const summarize = (desc, txt, threshold, maxlen) => { return desc.length > threshold ? desc : truncate(txt, maxlen).replace(/\n/g, ' ') } export default async (inputHtml, inputUrl = '') => { const html = cleanify(inputHtml) const meta = extractMetaData(html) let title = meta.title const { url, shortlink, amphtml, canonical, description: metaDesc, image: metaImg, author, published } = meta const { descriptionLengthThreshold, descriptionTruncateLen, contentLengthThreshold } = getParserOptions() // gather title if (!title) { title = extractTitleWithReadability(html, inputUrl) } if (!title) { return null } // gather urls to choose the best url later const links = unique( [url, shortlink, amphtml, canonical, inputUrl] .filter(isValidUrl) .map(purifyUrl) ) if (!links.length) { return null } // choose the best url const bestUrl = chooseBestUrl(links, title) const fns = pipe( (input) => { return normalizeUrls(input, bestUrl) }, (input) => { return execPreParser(input, links) }, (input) => { return extractWithReadability(input, bestUrl) }, (input) => { return input ? execPostParser(input, links) : null }, (input) => { return input ? cleanAndMinifyHtml(input) : null } ) const content = fns(html) if (!content) { return null } const textContent = stripTags(content) if (textContent.length < contentLengthThreshold) { return null } const description = summarize( metaDesc, textContent, descriptionLengthThreshold, descriptionTruncateLen ) const image = metaImg ? absolutifyUrl(bestUrl, metaImg) : '' return { url: bestUrl, title, description, links, image, content, author, source: getDomain(bestUrl), published, ttr: getTimeToRead(textContent) } }