@drjohnvidler/google-scholar-scrape
Version:
Scrapes Google Scholar data to a usable JSON API
119 lines (93 loc) • 3.63 kB
JavaScript
import { parse } from 'node-html-parser';
import puppeteer from 'puppeteer';
import crypto from 'crypto';
import fs from 'fs';
// Directory that holds cached page snapshots. The CACHE_ROOT environment
// variable overrides it; an unset or empty value falls back to ".cache".
const CACHE_ROOT = process.env.CACHE_ROOT ? process.env.CACHE_ROOT : '.cache';

// Milliseconds in one day, used to express the cache lifetime below.
const MS_PER_DAY = 24 * 60 * 60 * 1000;

// Maximum lifetime of a cache entry: 30 days, in milliseconds.
const CACHE_EXPIRY = 30 * MS_PER_DAY;

// Upper bound on the random amount subtracted from CACHE_EXPIRY when an entry
// is written (30% of the lifetime), so that entries do not all expire together.
const CACHE_ENTROPY = CACHE_EXPIRY * 0.3;
/**
 * Reads a cached page snapshot from disk.
 *
 * A cache file is JSON of the form `{ validUntil: <epoch ms>, data: <base64 HTML> }`.
 * Missing, expired, malformed or unparseable files are all treated as a cache
 * miss; anything that exists but is unusable is deleted so it gets rewritten.
 *
 * @param {string} cachepath - Path of the cache file to read.
 * @returns {Promise<string|null>} The cached HTML, or `null` on a cache miss.
 */
async function readCachedHtml(cachepath) {
  let raw;
  try {
    raw = await fs.promises.readFile(cachepath, 'utf8');
  } catch (err) {
    // No cache file yet is the normal "miss" case; anything else is a real error.
    if (err.code === 'ENOENT') return null;
    throw err;
  }

  let record = null;
  try {
    record = JSON.parse(raw);
  } catch (err) {
    // A truncated or corrupt cache file must not wedge this URL forever:
    // fall through, delete it, and let the caller fetch a fresh copy.
    record = null;
  }

  const isUsable =
    record !== null &&
    typeof record === 'object' &&
    typeof record.data === 'string' &&
    record.validUntil > Date.now();
  if (isUsable) {
    return Buffer.from(record.data, 'base64').toString('utf8');
  }

  try {
    await fs.promises.unlink(cachepath);
  } catch (err) {
    // Already removed by someone else: nothing left to clean up.
    if (err.code !== 'ENOENT') throw err;
  }
  return null;
}

/**
 * Scrapes publication listings from Google Scholar profile pages, caching
 * every fetched page on disk under CACHE_ROOT.
 */
export default class GoogleScholar {
  /**
   * Creates the cache directory if it does not exist yet. The headless
   * browser is not started here; it is launched on the first uncached query.
   */
  constructor() {
    if (!fs.existsSync(CACHE_ROOT)) {
      fs.mkdirSync(CACHE_ROOT, { recursive: true });
    }
    this.browser = null;
    this.page = null;
  }

  /**
   * Fetches a URL and returns its HTML parsed with node-html-parser.
   *
   * The cache key is the SHA-256 hex digest of the URL. On a cache hit the
   * stored HTML is parsed and returned without starting a browser. On a miss
   * the page is loaded in headless Chromium, stored with a randomised expiry
   * (between 70% and 100% of CACHE_EXPIRY), and the browser is shut down again.
   *
   * @param {string} url - Absolute URL to fetch.
   * @returns {Promise<object>} Root node of the parsed document.
   * @throws Propagates browser/navigation errors and file-system errors other
   *   than a missing cache file.
   */
  async runQuery(url) {
    console.log(`Pulling ${url}`);

    const hash = crypto.createHash('sha256').update(url).digest('hex');
    const cachepath = `${CACHE_ROOT}/${hash}.json`;

    const cachedHtml = await readCachedHtml(cachepath);
    if (cachedHtml !== null) {
      return parse(cachedHtml);
    }

    let data;
    try {
      if (this.browser == null) {
        this.browser = await puppeteer.launch({
          headless: true,
          args: [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-dev-shm-usage',
            '--disable-accelerated-2d-canvas',
            '--disable-gpu',
          ],
        });
      }
      if (this.page == null) {
        this.page = await this.browser.newPage();
      }

      await this.page.goto(url);
      data = await this.page.content();
    } finally {
      // Always tear the browser down, even when navigation fails; otherwise a
      // failed query would leave a Chromium process running.
      const { page, browser } = this;
      this.page = null;
      this.browser = null;
      try {
        if (page != null) await page.close();
      } finally {
        if (browser != null) await browser.close();
      }
    }

    const record = {
      validUntil: Date.now() + (CACHE_EXPIRY - Math.random() * CACHE_ENTROPY),
      data: Buffer.from(data).toString('base64'),
    };
    await fs.promises.writeFile(cachepath, JSON.stringify(record));

    return parse(data);
  }

  /**
   * Lists the publications on a Google Scholar profile.
   *
   * Loads the profile's "list_works" page, then each publication's detail
   * page, and merges both into one record per publication. Rows that carry no
   * detail link are skipped. Detail fields are keyed by their lower-cased
   * label as it appears on the page; any field whose label contains "authors"
   * is split on commas into an array of trimmed names.
   *
   * @param {string} user - Google Scholar user id (the `user` query parameter).
   * @returns {Promise<Array<object>>} One record per publication with at least
   *   `title` (string), `citations` (number, 0 when absent or unparseable) and
   *   `year` (string, possibly empty); `fullText` is set when the detail page
   *   title block contains a link.
   */
  async getCitations(user) {
    const params = {
      user,
      hl: 'en',
      view_op: 'list_works',
      cstart: 0,
      pagesize: 1000,
    };
    // Built with encodeURIComponent so the URL, and therefore the cache key,
    // stays the same as for entries written by earlier versions.
    const query = Object.entries(params)
      .map(([key, value]) => `${key}=${encodeURIComponent(value)}`)
      .join('&');
    const url = `https://scholar.google.com/citations?${query}`;

    const listings = [];
    const out = await this.runQuery(url);

    // Detail pages are fetched one at a time: runQuery shares a single
    // browser/page pair on `this`, so the requests must not overlap.
    for (const row of out.querySelectorAll('tr.gsc_a_tr')) {
      const link = row.querySelector('a.gsc_a_at');
      const href = link?.getAttribute('href');
      if (!href) {
        // No detail link means no title and nothing to follow up on.
        continue;
      }

      const citationText = row.querySelector('td.gsc_a_c')?.textContent ?? '';
      const record = {
        title: link.textContent,
        citations: Number.parseInt(citationText, 10) || 0,
        year: row.querySelector('td.gsc_a_y')?.textContent || '',
      };

      const pubData = await this.runQuery(`https://scholar.google.com${href}`);

      const pubLink = pubData
        .querySelector('div.gsc_oci_title_ggi')
        ?.querySelector('a');
      if (pubLink) {
        record.fullText = pubLink.getAttribute('href');
      }

      for (const datum of pubData.querySelectorAll('div.gs_scl')) {
        const labelNode = datum.childNodes[0];
        const valueNode = datum.childNodes[1];
        if (!labelNode || !valueNode) {
          // Not a label/value pair; ignore rather than abort the whole scrape.
          continue;
        }

        const key = labelNode.textContent.toLowerCase();
        const text = valueNode.textContent;
        record[key] = key.includes('authors')
          ? text.split(',').map((author) => author.trim())
          : text;
      }

      listings.push(record);
    }

    return listings;
  }
}