UNPKG

njtool

Version:
150 lines (133 loc) 4.24 kB
// This file is distributed under the MIT license.
// See LICENSE file in the project root for details.

'use strict';

const moment = require('moment');
const puppeteer = require('puppeteer');

// Volumes below this number are served from the older URL scheme and scraped
// with collectArticles2017; this volume and later ones use the newer scheme
// and collectArticles2018.
// NOTE(review): the function names suggest the site layout changed between
// 2017 and 2018 - confirm that 553 is the exact first volume of the new layout.
const FIRST_VOLUME_WITH_NEW_LAYOUT = 553;

// Date format expected in page titles and in the issue metadata element.
const SOURCE_DATE_FORMAT = 'D MMMM YYYY';

/**
 * Collects the title and link of every article on an old-layout issue page.
 *
 * This function is handed to `page.evaluate`, so it executes inside the
 * browser page and must stay self-contained: it may only use page globals
 * such as `document`, never anything from this module's scope.
 *
 * @returns {Array<{title: string, url: string}>} One entry per
 *     `#content article` element, in document order.
 */
// istanbul ignore next
function collectArticles2017() {
  const articleElements = document.querySelectorAll('#content article');
  return Array.from(articleElements).map((article) => {
    // The first link of the article carries both the title and the URL.
    const link = article.querySelector('a');
    return { title: link.innerText, url: link.href };
  });
}

/**
 * Collects detailed information about every article on a new-layout issue
 * page.
 *
 * This function is handed to `page.evaluate`, so it executes inside the
 * browser page and must stay self-contained: it may only use page globals
 * such as `document`, never anything from this module's scope.
 *
 * @returns {Array<{title: string, type: ?string, date: ?string,
 *     description: ?string, authors: string[], url: string}>} One entry per
 *     `article` element, in document order. Optional fields are `null` when
 *     the corresponding element is missing; `authors` is then an empty array.
 */
// istanbul ignore next
function collectArticles2018() {
  const articleElements = document.querySelectorAll('article');
  return Array.from(articleElements).map((article) => {
    // The first link of the article carries both the title and the URL.
    const link = article.querySelector('a');
    const typeElement = article.querySelector('[data-test="article.type"]');
    const dateElement = article.querySelector('time');
    const descElement = article.querySelector('[itemprop="description"] p');
    const authorElements = article.querySelectorAll(
      '[data-test="author-list"] [itemprop="name"]'
    );
    return {
      title: link.innerText,
      type: typeElement ? typeElement.innerText : null,
      date: dateElement ? dateElement.dateTime : null,
      description: descElement ? descElement.innerText : null,
      authors: Array.from(authorElements).map((elem) => elem.innerText),
      url: link.href,
    };
  });
}

/**
 * A single journal issue, identified by a `name:volume:issue` string, that
 * can scrape its own table of contents from www.nature.com.
 */
class Journal {
  /**
   * @param {string} id - Journal ID in the form `name:volume:issue`,
   *     e.g. `nature:553:7686`.
   * @throws {Error} If the ID lacks one of the three components, if the
   *     volume or issue is not a number, or if the journal name is not
   *     `nature`.
   */
  constructor(id) {
    const [name, volume, issue] = id.split(':');
    if (name === undefined || volume === undefined || issue === undefined) {
      throw new Error(`Invalid journal ID: ${id}`);
    }
    if (name !== 'nature') {
      throw new Error(`Not supported at this moment: ${name}`);
    }

    const volumeNumber = Number.parseInt(volume, 10);
    const issueNumber = Number.parseInt(issue, 10);
    // Reject IDs such as `nature::` or `nature:abc:1`; they would otherwise
    // produce URLs containing `NaN`.
    if (Number.isNaN(volumeNumber) || Number.isNaN(issueNumber)) {
      throw new Error(`Invalid journal ID: ${id}`);
    }

    this.name = name;
    this.volume = volumeNumber;
    this.issue = issueNumber;
    this.content = null;
    this.error = null;
  }

  /**
   * Creates journals from one ID or from an array of IDs.
   *
   * @param {string|string[]} args - A single journal ID or an array of them.
   * @returns {Journal[]} One Journal per ID, in input order.
   * @throws {Error} If any ID is rejected by the constructor.
   */
  static from(args) {
    const ids = Array.isArray(args) ? args : [args];
    return ids.map((id) => new Journal(id));
  }

  /** @returns {string} The normalized `name:volume:issue` identifier. */
  get id() {
    return `${this.name}:${this.volume}:${this.issue}`;
  }

  /** @returns {string} URL of the issue's table-of-contents page. */
  get url() {
    const base = 'https://www.nature.com';
    if (this.volume < FIRST_VOLUME_WITH_NEW_LAYOUT) {
      return `${base}/${this.name}/journal/v${this.volume}/n${this.issue}/index.html`;
    }
    return `${base}/${this.name}/volumes/${this.volume}/issues/${this.issue}`;
  }

  /**
   * Plain-object summary of the issue.
   *
   * @returns {Object} Always contains `name`, `volume`, `issue` and `url`.
   *     Contains `date` and `articles` once content has been scraped, and
   *     `error` (a message string) when the last scrape failed.
   */
  get metadata() {
    const metadata = {
      name: this.name,
      volume: this.volume,
      issue: this.issue,
      url: this.url,
    };
    if (this.content) {
      metadata.date = this.content.date;
      metadata.articles = this.content.articles;
    }
    if (this.error) {
      metadata.error = this.error;
    }
    return metadata;
  }

  /**
   * Opens the issue page in a browser and collects its date and articles.
   *
   * Failures while loading or scraping are not thrown; their message is
   * stored in `this.error` and reported through the returned metadata.
   *
   * @param {Object} [options] - Browser launch options.
   * @param {boolean} [options.headless] - Passed through to
   *     `puppeteer.launch`.
   * @param {boolean} [options.sandbox] - When falsy, the browser is launched
   *     with `--no-sandbox --disable-setuid-sandbox`.
   * @returns {Promise<Object>} The value of `this.metadata` after the scrape.
   */
  async scrape(options = {}) {
    const launchOptions = { headless: options.headless };
    if (!options.sandbox) {
      launchOptions.args = ['--no-sandbox', '--disable-setuid-sandbox'];
    }

    // Forget the outcome of a previous failed attempt so a successful retry
    // does not keep reporting a stale error.
    this.error = null;

    let browser = null;
    try {
      browser = await puppeteer.launch(launchOptions);
      const page = await browser.newPage();
      await page.goto(this.url);

      const title = await page.title();
      if (title.startsWith('Page not found')) {
        throw new Error('Not found');
      }

      // Prefer the date embedded in the title; fall back to the page body
      // when the title has no parseable date.
      let date = this._getDateFromTitle(title);
      if (!date) {
        date = await this._getDateFromPage(page);
      }

      const articles = await page.evaluate(this._collectArticlesFunction);
      this.content = { date, articles };
    } catch (e) {
      this.error = e.message;
    } finally {
      // Always release the browser process, whether or not scraping failed.
      if (browser) {
        await browser.close();
      }
    }
    return this.metadata;
  }

  /**
   * @returns {Function} The in-page article collector matching the layout
   *     used for this volume.
   */
  get _collectArticlesFunction() {
    if (this.volume < FIRST_VOLUME_WITH_NEW_LAYOUT) {
      return collectArticles2017;
    }
    return collectArticles2018;
  }

  /**
   * Extracts the issue date from a page title of the form
   * `<anything>, <D MMMM YYYY>`.
   *
   * @param {string} title - The page title.
   * @returns {?string} The date as `YYYY-MM-DD`, or `null` when the title
   *     has no `', '` separator or its second component is not a date.
   */
  _getDateFromTitle(title) {
    const components = title.split(', ');
    if (components.length < 2) {
      return null;
    }
    return this._convertDate(components[1]);
  }

  /**
   * Extracts the issue date from the `#issue-meta .more` element of the page.
   *
   * @param {Object} page - A puppeteer page already navigated to the issue.
   * @returns {Promise<?string>} The date as `YYYY-MM-DD`, or `null` when the
   *     element is missing or its text is not a date.
   */
  async _getDateFromPage(page) {
    const text = await page.evaluate(() => {
      // istanbul ignore next
      const element = document.querySelector('#issue-meta .more');
      // istanbul ignore next
      return element ? element.innerText : null;
    });
    return this._convertDate(text);
  }

  /**
   * Converts a `D MMMM YYYY` date string (e.g. `4 January 2018`) to
   * `YYYY-MM-DD`.
   *
   * @param {?string} date - The date text to convert.
   * @returns {?string} The ISO-style date, or `null` when the input is empty
   *     or cannot be parsed. Returning `null` rather than moment's
   *     `'Invalid date'` placeholder lets callers detect the failure with a
   *     simple falsy check.
   */
  _convertDate(date) {
    if (!date) {
      return null;
    }
    const parsed = moment(date, SOURCE_DATE_FORMAT);
    return parsed.isValid() ? parsed.format('YYYY-MM-DD') : null;
  }
}

module.exports = Journal;