UNPKG

@drjohnvidler/google-scholar-scrape

Version:

Scrapes Google Scholar data to a usable JSON API

119 lines (93 loc) 3.63 kB
import { parse } from 'node-html-parser';
import puppeteer from 'puppeteer';
import crypto from 'crypto';
import fs from 'fs';

/** Directory holding cached page snapshots; overridable via the CACHE_ROOT environment variable. */
const CACHE_ROOT = process.env.CACHE_ROOT || '.cache';

/** Maximum lifetime of a cache entry, in milliseconds. */
const CACHE_EXPIRY = 1000 * 60 * 60 * 24 * 30; // 30 days

/**
 * Upper bound of the random amount subtracted from CACHE_EXPIRY when an entry
 * is written, so each entry lives between 70% and 100% of CACHE_EXPIRY.
 * NOTE(review): presumably this staggers expiry so entries are not all
 * re-fetched at once - confirm intent.
 */
const CACHE_ENTROPY = CACHE_EXPIRY * 0.3;

/** Chromium flags passed to puppeteer.launch() for every fetch. */
const BROWSER_ARGS = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-accelerated-2d-canvas',
    '--disable-gpu'
];

/**
 * Read a still-valid cached page from disk.
 *
 * Expired, unparseable or malformed entries are deleted and reported as a miss,
 * so a damaged cache file can never permanently break lookups for its URL.
 *
 * @param {string} cachepath - Path of the JSON cache record.
 * @returns {Promise<string|null>} The cached HTML, or null when there is no usable entry.
 * @throws {Error} On filesystem errors other than the file not existing.
 */
async function readCachedHtml( cachepath ) {
    let raw;
    try {
        raw = await fs.promises.readFile( cachepath, 'utf8' );
    } catch( err ) {
        if( err.code === 'ENOENT' ) return null; // nothing cached for this URL yet
        throw err;
    }

    let record = null;
    try {
        record = JSON.parse( raw );
    } catch( err ) {
        if( !( err instanceof SyntaxError ) ) throw err;
        // Corrupt or truncated record: fall through and discard it below.
    }

    if( record && typeof record.data === 'string' && record.validUntil > Date.now() )
        return Buffer.from( record.data, 'base64' ).toString( 'utf8' );

    await fs.promises.unlink( cachepath );
    return null;
}

/**
 * Close and forget the page and browser held by a GoogleScholar instance.
 *
 * The instance fields are cleared first so the instance never keeps a
 * reference to a closed page/browser, and the browser is closed even when
 * closing the page fails.
 *
 * @param {GoogleScholar} scholar - Instance whose `page` / `browser` should be released.
 * @returns {Promise<void>}
 */
async function releaseBrowser( scholar ) {
    const { page, browser } = scholar;
    scholar.page = null;
    scholar.browser = null;

    try {
        if( page != null ) await page.close();
    } finally {
        if( browser != null ) await browser.close();
    }
}

/**
 * Text of the first element under `node` matching `selector`.
 *
 * @param {object} node - Parsed element to search within.
 * @param {string} selector - CSS selector of the wanted element.
 * @returns {string} The element's text, or "" when it is missing or empty.
 */
function cellText( node, selector ) {
    const cell = node.querySelector( selector );
    return ( cell && cell.textContent ) || "";
}

/**
 * Copy the fields of a publication detail page onto a listing record.
 *
 * Sets `fullText` when the page carries a link inside `div.gsc_oci_title_ggi`,
 * then adds one property per `div.gs_scl` row, keyed by the lower-cased text of
 * the row's first child node with the text of its second child node as value.
 * Values whose key contains "authors" are split on commas into a trimmed array.
 * Rows with fewer than two child nodes are skipped.
 *
 * @param {object} record - Listing record to extend in place.
 * @param {object} pubData - Parsed detail page.
 * @returns {void}
 */
function addPublicationDetails( record, pubData ) {
    const pubInfo = pubData.querySelector( "div.gsc_oci_title_ggi" );
    const pubLink = pubInfo ? pubInfo.querySelector( "a" ) : null;
    if( pubLink ) record.fullText = pubLink.getAttribute( "href" );

    for( const datum of pubData.querySelectorAll( "div.gs_scl" ) ) {
        const [ label, content ] = datum.childNodes;
        if( !label || !content ) continue;

        const key = label.textContent.toLowerCase();
        let value = content.textContent;
        if( key.includes( 'authors' ) )
            value = value.split( "," ).map( author => author.trim() );

        record[key] = value;
    }
}

/**
 * Scrapes Google Scholar profile pages through a headless browser, keeping an
 * on-disk cache of every fetched page under CACHE_ROOT.
 */
export default class GoogleScholar {

    /** Ensures the cache directory exists; no browser is started until a page is actually fetched. */
    constructor() {
        if( !fs.existsSync( CACHE_ROOT ) )
            fs.mkdirSync( CACHE_ROOT, { recursive: true } );

        this.browser = null;
        this.page = null;
    }

    /**
     * Fetch a URL and return its parsed HTML, served from the disk cache when a
     * valid entry exists.
     *
     * Cache records are stored as `<sha256 of url>.json` inside CACHE_ROOT and
     * hold the page HTML base64-encoded plus a `validUntil` timestamp. On a
     * miss the page is loaded in headless Chromium; the browser is always shut
     * down afterwards, including when navigation fails.
     *
     * @param {string} url - Absolute URL to load.
     * @returns {Promise<object>} Root node produced by node-html-parser's `parse`.
     * @throws {Error} When the browser cannot load the page or the cache cannot be read or written.
     */
    async runQuery( url ) {
        console.log( `Pulling ${url}` );

        const hash = crypto.createHash('sha256').update( url ).digest( 'hex' );
        const cachepath = `${CACHE_ROOT}/${hash}.json`;

        const cached = await readCachedHtml( cachepath );
        if( cached !== null )
            return parse( cached );

        let data;
        try {
            if( this.browser == null )
                this.browser = await puppeteer.launch({ headless: true, args: BROWSER_ARGS });

            if( this.page == null )
                this.page = await this.browser.newPage();

            await this.page.goto( url );
            data = await this.page.content();
        } finally {
            // Without this, a failed navigation would leave a Chromium process running.
            await releaseBrowser( this );
        }

        const record = {
            "validUntil": new Date().getTime() + (CACHE_EXPIRY - (Math.random() * CACHE_ENTROPY)),
            "data": Buffer.from( data ).toString('base64')
        };
        await fs.promises.writeFile( cachepath, JSON.stringify( record ) );

        return parse( data );
    }

    /**
     * List the publications shown on a Google Scholar profile.
     *
     * Loads the profile's works list, then loads each publication's detail page
     * one after another and merges its fields into the listing entry. Rows
     * without a title link are skipped; rows whose link has no `href` are
     * returned with the list-level fields only.
     *
     * NOTE(review): a single request with pagesize=1000 is made and there is no
     * paging loop - confirm the server actually returns that many rows.
     *
     * @param {string} user - Value for the `user` query parameter (the profile id).
     * @returns {Promise<object[]>} One record per publication with `title`,
     *   `citations` (number, 0 when not parseable), `year` (string), optionally
     *   `fullText`, plus whatever fields the detail page provides.
     * @throws {Error} When any page fetch fails.
     */
    async getCitations( user ) {
        const params = {
            "user": user,
            "hl": "en",
            "view_op": "list_works",
            "cstart": 0,
            "pagesize": 1000
        };
        // Built exactly as before so existing cache entries (keyed by URL hash) stay valid.
        const query = Object.keys( params )
            .map( key => `${key}=${encodeURIComponent( params[key] )}` )
            .join( '&' );
        const url = `https://scholar.google.com/citations?${query}`;

        const listings = [];
        const out = await this.runQuery( url );

        for( const row of out.querySelectorAll( "tr.gsc_a_tr" ) ) {
            const link = row.querySelector( "a.gsc_a_at" );
            if( !link ) continue; // not a publication row we can interpret

            const record = {
                "title": link.textContent,
                "citations": Number.parseInt( cellText( row, "td.gsc_a_c" ), 10 ) || 0,
                "year": cellText( row, "td.gsc_a_y" )
            };

            const href = link.getAttribute( "href" );
            if( href ) {
                const pubData = await this.runQuery( `https://scholar.google.com${href}` );
                addPublicationDetails( record, pubData );
            }

            listings.push( record );
        }

        return listings;
    }
}