UNPKG

@cao-mei-you-ren/postlight_parser

Version:

Postlight Parser transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.

112 lines (105 loc) 2.65 kB
#!/usr/bin/env node /* eslint-disable */ const Parser = require('./dist/mercury'); const package_info = require('./package.json'); const argv = require('yargs-parser')(process.argv.slice(2)); const { _: [url], format, f, extend, e, extendList, l, header, h, addExtractor, x, version, } = argv; (async ( urlToParse, contentType, extendedTypes, extendedListTypes, headers, addExtractor, version ) => { if (version) { console.log(package_info.version); process.exit(0); } if (!urlToParse) { console.log( '\n\ postlight-parser\n\n\ The Postlight Parser extracts semantic content from any url\n\n\ Usage:\n\ \n\ $ postlight-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... [--add-extractor path_to_extractor.js]... \n\ \n\ ' ); return; } try { const contentTypeMap = { html: 'html', markdown: 'markdown', md: 'markdown', text: 'text', txt: 'text', }; const extensions = {}; [].concat(extendedTypes || []).forEach(t => { const [name, selector] = t.split('='); const fullSelector = selector.indexOf('|') > 0 ? selector.split('|') : selector; extensions[name] = { selectors: [fullSelector] }; }); [].concat(extendedListTypes || []).forEach(t => { const [name, selector] = t.split('='); const fullSelector = selector.indexOf('|') > 0 ? selector.split('|') : selector; extensions[name] = { selectors: [fullSelector], allowMultiple: true, }; }); // Attempt to load custom extractor from path. let customExtractor; if (addExtractor) { customExtractor = require(addExtractor); } const result = await Parser.parse(urlToParse, { contentType: contentTypeMap[contentType], extend: extensions, headers, customExtractor, }); console.log(JSON.stringify(result, null, 2)); } catch (e) { if (e.message === 'ETIMEDOUT' && false) { console.error( '\nPostlight Parser encountered a timeout trying to load that resource.' ); } else { console.error( '\nPostlight Parser encountered a problem trying to parse that resource.\n' ); console.error(e); } const reportBug = 'If you believe this was an error, please file an issue at:\n\n https://github.com/postlight/parser/issues/new'; console.error(`\n${reportBug}\n`); process.exit(1); } })( url, format || f, extend || e, extendList || l, header || h, addExtractor || x );