UNPKG

article-parser

Version:

To extract the main article from a given URL

demos.pwshub.com/article-parser

ndaidong/article-parser

128 lines (121 loc) • 4.51 kB

JavaScript

// parseFromHtml.test
/* eslint-env jest */

import { readFileSync } from 'fs'

import { isFunction } from 'bellajs'

import parseFromHtml from './parseFromHtml.js'
import { addTransformations } from './transformation.js'

/**
 * Test suite for parseFromHtml(html, url).
 *
 * The first part is table-driven: every entry of `cases` feeds one HTML
 * fixture (and optionally a URL) to parseFromHtml() and checks the resolved
 * value. The second part registers two transformations and checks the
 * content produced for a single fixture.
 */
describe('test parseFromHtml()', () => {
  // Description expected for the "regular article" fixture.
  const expDesc = "Navigation here Few can name a rational peach that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs."

  // Each case has:
  //   input.desc   - label interpolated into the test name
  //   input.html   - fixture markup read synchronously from ./test-data
  //   input.url    - optional; the runner below falls back to '' when absent
  //   expectation  - either a plain value compared with toEqual(), or a
  //                  function (result, expect) => void that runs its own
  //                  assertions against the parsed result
  const cases = [
    {
      input: {
        desc: 'a webpage with no title',
        html: readFileSync('./test-data/html-no-title.html', 'utf8')
      },
      expectation: null
    },
    {
      input: {
        desc: 'a webpage without link',
        html: readFileSync('./test-data/html-no-link.html', 'utf8')
      },
      expectation: null
    },
    {
      input: {
        desc: 'a webpage with no main article',
        html: readFileSync('./test-data/html-no-article.html', 'utf8')
      },
      expectation: null
    },
    {
      input: {
        desc: 'a webpage with a very short article',
        html: readFileSync('./test-data/html-too-short-article.html', 'utf8'),
        url: 'abcd'
      },
      expectation: null
    },
    {
      input: {
        desc: 'a webpage with article but no source',
        html: readFileSync('./test-data/html-article-no-source.html', 'utf8')
      },
      expectation: (result, expect) => {
        expect(result.source).toEqual('somewhere.any')
      }
    },
    {
      input: {
        desc: 'a webpage with data-src in img tag',
        html: readFileSync('./test-data/html-article-with-data-src.html', 'utf8')
      },
      expectation: (result, expect) => {
        // The parsed content is expected to carry plain `src` attributes.
        expect(result.content).toEqual(expect.stringContaining('<img src="https://somewhere.any/image1.jpg" />'))
        expect(result.content).toEqual(expect.stringContaining('<img src="https://somewhere.any/image2.jpg" />'))
      }
    },
    {
      input: {
        desc: 'a webpage with regular article',
        html: readFileSync('./test-data/regular-article.html', 'utf8'),
        url: 'https://somewhere.com/path/to/article'
      },
      expectation: (result, expect) => {
        expect(result.title).toEqual('Article title here')
        expect(result.description).toEqual(expDesc)
        // Links in the content are expected as absolute URLs with target="_blank".
        expect(result.content).toEqual(expect.stringContaining('<a target="_blank" href="https://otherwhere.com/descriptions/rational-peach">'))
        expect(result.content).toEqual(expect.stringContaining('<a target="_blank" href="https://somewhere.com/dict/watermelon">'))
      }
    }
  ]

  // Register one Jest test per table entry.
  cases.forEach((acase) => {
    const { input, expectation } = acase
    const { desc, html, url = '' } = input
    test(`check if parseFromHtml() works with ${desc}`, async () => {
      const result = await parseFromHtml(html, url)
      if (isFunction(expectation)) {
        // Custom assertions supplied by the case itself.
        expectation(result, expect)
      } else {
        expect(result).toEqual(expectation)
      }
    })
  })

  test('check if parseFromHtml() works with multi transforms', async () => {
    // NOTE(review): these registrations are not undone anywhere in this
    // file; whether they persist for other tests depends on how
    // transformation.js stores them - confirm.
    addTransformations([
      {
        patterns: [
          /http(s?):\/\/need-transform.tld\/*/
        ],
        // Replace every <a> element with a text node of the form
        // [link url="HREF"]INNER_HTML[/link].
        post: (document) => {
          document.querySelectorAll('a').forEach((node) => {
            const sHtml = node.innerHTML
            const link = node.getAttribute('href')
            node.parentNode.replaceChild(document.createTextNode(`[link url="${link}"]${sHtml}[/link]`), node)
          })
          return document
        }
      },
      {
        patterns: [
          /http(s?):\/\/sw.re\/*/
        ],
        // Replace every <strong> element with a <b> element holding the
        // same inner HTML.
        post: (document) => {
          document.querySelectorAll('strong').forEach((node) => {
            const b = document.createElement('B')
            b.innerHTML = node.innerHTML
            node.parentNode.replaceChild(b, node)
          })
          return document
        }
      }
    ])

    const html = readFileSync('./test-data/vnn-article.html', 'utf8')
    const url = 'https://need-transform.tld/path/to/article'
    const result = await parseFromHtml(html, url)

    expect(result.title).toEqual('Article title here')
    // The original anchor markup must be gone, replaced by the [link] text form.
    expect(result.content).toEqual(expect.not.stringContaining('<a href="https://vnn.vn/dict/watermelon" target="_blank">'))
    expect(result.content).toEqual(expect.stringContaining('[link url="https://vnn.vn/dict/watermelon"]watermelon[/link]'))
    // The <strong> -> <b> replacement is expected to have been applied too.
    // NOTE(review): the test URL does not match the sw.re pattern, so the
    // second transformation is presumably selected through a URL found in
    // the fixture itself - confirm against vnn-article.html.
    expect(result.content).toEqual(expect.stringContaining('<b>in its own way</b>'))
  })
})