// jarviscrawlercore
// Version:
// jarvis crawler core
// 227 lines (169 loc) • 6.93 kB
// JavaScript
const {mgrPlugins} = require('./pluginsmgr');
const log = require('../../src/log');
/**
 * ismine - reports whether a URL is handled by this plugin, i.e. whether it
 * starts with the zhihu column post prefix `https://zhuanlan.zhihu.com/p/`.
 * @param {string} url - URL
 * @return {boolean} ismine - true only for strings that begin with the prefix
 */
function ismine(url) {
  // Anything that is not a string (null, undefined, numbers, ...) cannot be
  // a matching URL; answer false instead of throwing on a missing method.
  if (typeof url !== 'string') {
    return false;
  }

  return url.startsWith('https://zhuanlan.zhihu.com/p/');
}
/**
 * exportArticle - extracts a zhihu column post from the given page.
 *
 * The page content is first replaced by the inner HTML of `.Post-content`.
 * Then, inside the page, a simplified document is appended to <body>: a
 * `div.article-head` (title image, title, author, date) and a
 * `div.article-body` (paragraphs, h2 headings and figure images).
 *
 * The in-page code relies on `getElement`, `fetchImage` and
 * `clearArticleElement` being available as globals in the page; they are not
 * defined in the visible part of this file.
 *
 * Fields set on the result object:
 *   imgs        - values returned by fetchImage for each figure image
 *   paragraphs  - {pt: 1, text} for <p>, {pt: 2, imgURL} for figure images,
 *                 {pt: 3, text} for <h2>
 *   titleImage  - value returned by fetchImage for `.TitleImage` (if present)
 *   title, author, writeTime - text found in the page (each optional)
 *   article     - innerText of <body> after clearArticleElement ran
 *
 * @param {object} page - page
 * @return {ExportArticleResult} result - `{result, err}`; `err` is set (and
 *   `result` is undefined) when the in-page extraction failed.
 */
async function exportArticle(page) {
  const dom = await page.$eval('.Post-content', (element) => {
    return element.innerHTML;
  });
  await page.setContent(dom);

  let errret = undefined;
  const ret = await page
    .evaluate(async () => {
      // Everything in this callback runs in the page, so helpers have to be
      // declared inside it rather than at module level.
      const ret = {};
      ret.imgs = [];
      ret.paragraphs = [];
      // Number of appended images whose load/error event has not fired yet;
      // polled from the Node side after evaluate() returns.
      window.waitimgs = 0;

      const objbody = getElement('body');
      if (!objbody) {
        // Fail with a clear message instead of a TypeError on null later on.
        throw new Error('zhihu.article: body element not found');
      }

      const objhead = document.createElement('div');
      objhead.className = 'article-head';
      objbody.appendChild(objhead);

      const objarticlebody = document.createElement('div');
      objarticlebody.className = 'article-body';
      objbody.appendChild(objarticlebody);

      // Picks the image URL of a <figure>: the data-src of its first <div>
      // when present, otherwise the src of its first <img>.
      const findFigureImageURL = (figure) => {
        const holders = figure.getElementsByTagName('div');
        if (holders.length > 0 && holders[0].dataset['src']) {
          return holders[0].dataset['src'];
        }

        const imgs = figure.getElementsByTagName('img');
        if (imgs.length > 0 && imgs[0].src) {
          return imgs[0].src;
        }

        return undefined;
      };

      // Fetches one image, records it in ret.imgs / ret.paragraphs and
      // appends a centered <img> to the article body.
      const appendImage = async (url) => {
        const imginfo = await fetchImage(url);
        ret.imgs.push(imginfo);
        ret.paragraphs.push({pt: 2, imgURL: url});

        const curnode = document.createElement('p');
        curnode.style.cssText = 'text-align: center;';
        const curimg = document.createElement('img');

        // Each image must release its slot in window.waitimgs exactly once,
        // whether it loads or fails; otherwise the wait below times out.
        let settled = false;
        const settle = () => {
          if (settled) {
            return;
          }
          settled = true;
          if (window.waitimgs > 0) {
            --window.waitimgs;
          }
        };

        curimg.onload = () => {
          // Write to the entry captured for THIS image; looking up the last
          // element of ret.imgs here would hit a later image when loads
          // finish out of order.
          // NOTE(review): a load event that fires after evaluate() resolved
          // presumably only updates the in-page object, not the value that
          // was already returned to Node - confirm if callers need sizes.
          if (imginfo && typeof imginfo === 'object') {
            imginfo.width = curimg.width;
            imginfo.height = curimg.height;
          }
          settle();
        };
        curimg.onerror = settle;

        ++window.waitimgs;
        curimg.src = url;
        curnode.appendChild(curimg);
        objarticlebody.appendChild(curnode);
      };

      const imghead = getElement('.TitleImage');
      if (imghead) {
        ret.titleImage = await fetchImage(imghead.src);
        const curnode = document.createElement('p');
        curnode.style.cssText = 'text-align: center;';
        const curimg = document.createElement('img');
        curimg.src = imghead.src;
        curnode.appendChild(curimg);
        objhead.appendChild(curnode);
      }

      const title = getElement('h1');
      if (title) {
        const objtitle = document.createElement('h1');
        objtitle.innerText = title.innerText;
        objhead.appendChild(objtitle);
        ret.title = objtitle.innerText;
      }

      const author = getElement('.AuthorInfo-head');
      // The author name is read from the first child element; skip the
      // author block when that child is missing instead of throwing.
      const authorname = author ? author.children[0] : undefined;
      if (authorname) {
        const objauthor = document.createElement('div');
        objauthor.className = 'article-author';
        objauthor.innerText = authorname.innerText;
        objhead.appendChild(objauthor);
        ret.author = objauthor.innerText;
      }

      const articletime = getElement('.ContentItem-time');
      if (articletime) {
        // Keep only the first YYYY-MM-DD date found in the time text.
        const datematch =
          /[1-9]\d{3}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])/.exec(
            articletime.innerText
          );
        // No date in the text: leave writeTime unset rather than aborting
        // the whole export.
        if (datematch) {
          const objarticletime = document.createElement('div');
          objarticletime.className = 'article-time';
          objarticletime.innerText = datematch[0];
          objhead.appendChild(objarticletime);
          ret.writeTime = objarticletime.innerText;
        }
      }

      const articlenode = getElement('.RichText.ztext.Post-RichText');
      if (articlenode) {
        // Only direct P, H2 and FIGURE children are exported; every other
        // tag is ignored.
        for (const child of Array.from(articlenode.children)) {
          if (child.tagName === 'FIGURE') {
            const imgurl = findFigureImageURL(child);
            if (imgurl) {
              await appendImage(imgurl);
            }
          } else if (child.tagName === 'H2') {
            const curnode = document.createElement('h2');
            curnode.innerText = child.innerText;
            ret.paragraphs.push({pt: 3, text: curnode.innerText});
            objarticlebody.appendChild(curnode);
          } else if (child.tagName === 'P') {
            const curnode = document.createElement('p');
            curnode.innerText = child.innerText;
            ret.paragraphs.push({pt: 1, text: curnode.innerText});
            objarticlebody.appendChild(curnode);
          }
        }
      }

      clearArticleElement(objbody);
      ret.article = objbody.innerText;

      return ret;
    })
    .catch((err) => {
      log.error('zhihu.article:getQuestion.evaluate', err);
      errret = err;
    });

  // Give the appended images a chance to finish loading; a timeout here is
  // logged but does not fail the export.
  await page.waitForFunction('window.waitimgs == 0').catch((err) => {
    log.error('zhihu.article.formatArticle', err);
  });

  return {
    result: ret,
    err: errret,
  };
}
// Hand the URL matcher (ismine) and the exporter (exportArticle) to the
// plugin manager under the 'zhihu.article' key - presumably the plugin name
// the manager uses for lookup/logging; confirm in ./pluginsmgr.
mgrPlugins.regExportArticle('zhihu.article', ismine, exportArticle);