bb-crawler
Version:
[](https://www.npmjs.com/package/bb-crawler) [](https://www.npmjs.com/package/bb-crawler)
733 lines (620 loc) • 19.7 kB
text/typescript
import * as OSS from 'ali-oss'
import * as Rds from 'ali-rds'
import * as Redis from 'ioredis'
import * as Cos from 'cos-nodejs-sdk-v5'
import * as cheerio from 'cheerio'
import * as async from 'async'
import * as Axios from 'axios'
import * as schedule from 'node-schedule'
import * as crypto from 'crypto'
import {Buffer} from 'buffer'
import {URL} from 'url'
import * as fs from 'fs'
import * as path from 'path'
import * as mkdirp from 'mkdirp'
import * as winston from 'winston'
import * as _ from 'lodash'
import * as util from 'util'
import * as Urllib from 'urllib';
/**
 * Options consumed by the crawler and by the per-URL context it creates.
 * Hook functions receive the running context as their first argument and,
 * except for `env` and `release`, the URL being processed as the second.
 */
class Config {
  /**
   * Source of seed URLs when `url` is not set. A function value is invoked
   * (and awaited) by the crawler to obtain the list.
   */
  urls?: string[] | (() => any);
  /** Single seed URL; when set it is used instead of `urls`. */
  url?: string;
  /** Schedule handed to node-schedule; when absent the crawl runs once, immediately. */
  cron?: string | any;
  /** An `ali-rds` instance, or options used to construct one. */
  mysql?: any;
  /** An `ioredis` instance, or options used to construct one. */
  redis?: any;
  /** An `ali-oss` instance, or options used to construct one. */
  oss?: any;
  /** A `cos-nodejs-sdk-v5` instance, or options used to construct one. */
  cos?: any;
  /** Async hook awaited at the end of context initialisation. */
  env?: any;
  /** Options passed to `axios.create`. */
  axios?: any;
  /** Logger settings (`console`, `level`, `service`, `files`). */
  logger?: any;
  /** Cache settings; `type` selects the backend (`'file'`, `'oss'` or `'cos'`). */
  cache?: {
    prefix: string
  } | any;
  /** Hook awaited before each URL is requested. */
  beforeRequest?: Function;
  /** Replaces the default network GET; its result becomes the context's `res`. */
  request?: Function;
  /** Converts the response into the context's `data`; defaults to `res.data`. */
  format?: Function;
  /** Persists the formatted data. */
  store?: Function;
  /** Hook awaited when a context is released. */
  release?: Function;
  /** Maps a URL to its cache key; defaults to `Utils.url_dir`. */
  cache_url?: Function;
  /** Maps a task URL to the URL actually requested; defaults to the identity. */
  request_url?: Function;
  /** Replaces the default cache read; must return the response object. */
  requestCache?: Function;
  /** Default options for the urllib-based network client. */
  urllib?: any;
  /** Network client selection; `type === 'axios'` picks axios, anything else urllib. */
  network?: any;
  /** Maximum number of URLs processed concurrently. */
  limit?: number;
  /** When truthy, a failing task does not abort the remaining tasks. */
  allowError?: boolean;
}
/**
 * Contract shared by the cache backends in this file
 * (file system, Aliyun OSS, Tencent COS).
 */
interface Cache {
  /** Resolves to whether an entry is stored under `key`. */
  hasCache(key: any): Promise<any>;

  /** Resolves to the content stored under `key`. */
  get(key: any): Promise<any>;

  /** Stores `value` under `key`. */
  set(key: any, value: any): Promise<any>;
}
/**
 * Cache backend that keeps entries as objects in an Aliyun OSS bucket.
 * Object names are the configured prefix followed by the cache key.
 */
export class CacheOSS implements Cache {
  ctx: Context;
  prefix: string;
  encoding: BufferEncoding;
  oss: OSS;

  /**
   * @param ctx Context whose `config.cache` supplies `prefix`, `encoding` and,
   *            optionally, dedicated `oss` client options. Without those
   *            options the context's own OSS client is reused.
   */
  constructor(ctx) {
    const settings = ctx.config.cache;
    this.ctx = ctx;
    this.prefix = settings.prefix || '';
    this.encoding = settings.encoding || 'utf8';
    this.oss = settings.oss ? new OSS(settings.oss) : ctx.oss;
  }

  /** Builds the object name used for `key`. */
  private objectName(key) {
    return `${this.prefix}${key}`;
  }

  /** Resolves `true` when a HEAD on the object succeeds; any failure counts as a miss. */
  async hasCache(key) {
    let found = true;
    try {
      await this.oss.head(this.objectName(key));
    } catch (e) {
      found = false;
    }
    return found;
  }

  /** Downloads the object and decodes its content with the configured encoding. */
  async get(key) {
    const {content} = await this.oss.get(this.objectName(key));
    return content.toString(this.encoding);
  }

  /** Uploads `value` encoded as a Buffer; falsy values are not stored. */
  async set(key, value) {
    if (value) {
      const payload = Buffer.from(value, this.encoding);
      await this.oss.put(this.objectName(key), payload);
    }
  }
}
/**
 * Cache backend that keeps entries as objects in a Tencent COS bucket.
 * Object keys are the configured prefix followed by the cache key.
 */
export class CacheCos implements Cache {
  cos: Cos;
  ctx: Context;
  prefix: string;
  Bucket: string;
  Region: string;

  /**
   * @param ctx Context whose `config.cache` supplies `prefix`, `Bucket`,
   *            `Region` and, optionally, dedicated `cos` client options.
   *            Without those options the context's own COS client is reused
   *            and bucket/region fall back to `ctx.config.cos`.
   */
  constructor(ctx) {
    this.ctx = ctx;
    const {prefix, cos, Bucket, Region} = ctx.config.cache;
    this.prefix = prefix || '';
    if (cos) {
      this.cos = new Cos(cos);
      this.Bucket = Bucket || cos.Bucket;
      this.Region = Region || cos.Region;
    } else {
      this.cos = ctx.cos;
      this.Bucket = Bucket || ctx.config.cos.Bucket;
      // Fixed: the region fallback previously read `cos.Bucket`.
      this.Region = Region || ctx.config.cos.Region;
    }
  }

  /** Resolves `true` when `headObject` succeeds; any error counts as a miss. */
  async hasCache(key) {
    return new Promise((resolve, reject) => {
      this.cos.headObject({
        Bucket: this.Bucket, // bucket name, format: test-1250000000
        Region: this.Region, // region the bucket lives in
        Key: `${this.prefix}${key}`
      }, (err, data) => {
        if (err) {
          resolve(false);
          return;
        }
        resolve(true);
      });
    });
  }

  /**
   * Resolves with the `Body` of the stored object exactly as the SDK returns it
   * (it is not decoded to a string here); rejects with the SDK error.
   */
  async get(key) {
    return new Promise((resolve, reject) => {
      this.cos.getObject({
        Bucket: this.Bucket, // bucket name, format: test-1250000000
        Region: this.Region, // region the bucket lives in
        Key: `${this.prefix}${key}`
      }, (err, data) => {
        if (err) {
          reject(err);
          return;
        }
        resolve(data.Body);
      });
    });
  }

  /**
   * Uploads `value` as the object body. Falsy values are not stored.
   * Resolves with the SDK response; rejects with the SDK error.
   */
  async set(key, value) {
    if (!value) {
      return;
    }
    return new Promise((resolve, reject) => {
      this.cos.putObject({
        Bucket: this.Bucket, // bucket name, format: test-1250000000
        Region: this.Region, // region the bucket lives in
        Key: `${this.prefix}${key}`,
        Body: value
      }, (err, data) => {
        if (err) {
          reject(err);
          return;
        }
        resolve(data);
      });
    });
  }
}
/**
 * Cache backend that keeps entries as files below a root directory.
 * The file for a key is `path.join(prefix, key)`.
 */
export class CacheFile implements Cache {
  prefix: string;
  encoding: BufferEncoding;

  /**
   * @param ctx Context whose `config.cache` supplies `prefix` (root directory,
   *            default `<cwd>/files`) and `encoding` (default `utf8`).
   */
  constructor(ctx) {
    const {prefix, encoding} = ctx.config.cache;
    this.prefix = prefix || path.join(process.cwd(), 'files');
    this.encoding = encoding || 'utf8';
  }

  /** Resolves to whether the file for `key` exists. */
  async hasCache(key) {
    return fs.existsSync(path.join(this.prefix, key));
  }

  /** Reads the file for `key` and returns it decoded with the configured encoding. */
  async get(key) {
    return fs.readFileSync(path.join(this.prefix, key), {encoding: this.encoding});
  }

  /**
   * Writes `value` to the file for `key`, creating parent directories as needed.
   * `undefined`/`null` values are skipped instead of making `writeFileSync` throw;
   * an empty string is still written, as before.
   */
  async set(key, value) {
    if (value == null) {
      return;
    }
    const file = path.join(this.prefix, key);
    // Standard-library replacement for mkdirp: no error when the directory exists.
    fs.mkdirSync(path.dirname(file), {recursive: true});
    return fs.writeFileSync(file, value, {encoding: this.encoding});
  }
}
/** Hashing and URL-to-path helpers exposed on the context as `utils`. */
export class Utils {
  /** Hex MD5 digest of `content`. */
  md5(content) {
    const hash = crypto.createHash('md5');
    hash.update(content);
    return hash.digest("hex");
  }

  /** Hex MD5 digest of the base64 form of `content` (read as UTF-8). */
  base64md5(content) {
    const encoded = Buffer.from(content, 'utf8').toString('base64');
    const hash = crypto.createHash('md5');
    hash.update(encoded);
    return hash.digest("hex");
  }

  /** Hex MD5 digest of the hex form of `content` (read as UTF-8). */
  hexmd5(content) {
    const encoded = Buffer.from(content, 'utf8').toString('hex');
    const hash = crypto.createHash('md5');
    hash.update(encoded);
    return hash.digest("hex");
  }

  /**
   * Flattens a URL into a single path segment: `:`, `/` and `?` become `_`,
   * `&` becomes `__`. Falsy input is returned unchanged.
   */
  url_path(url) {
    if (url) {
      return url.replace(/[:\/?&]/g, (ch) => (ch === '&' ? '__' : '_'));
    }
    return url;
  }

  /** Returns `<hostname>/<flattened pathname + query>` for `url`. */
  url_dir(url) {
    const {hostname, pathname, search} = new URL(url);
    const flattened = this.url_path(`${pathname}${search}`);
    return `${hostname}/${flattened}`;
  }
}
/**
 * Minimal HTTP client contract implemented by the axios- and urllib-based
 * clients in this file. `config` carries per-request options for the
 * underlying library.
 */
interface Network {
  /** GET with `params` as the query, asking for a text response. */
  get(url: any, params?: any, config?: any): any;

  /** GET with `params` as the query. */
  getJSON(url: any, params?: any, config?: any): any;

  /** POST with `data` as the payload. */
  postJSON(url: any, data?: any, config?: any): any;

  /** POST with `data` as the payload, asking for a text response. */
  postJSON_rText(url: any, data?: any, config?: any): any;
}
/** {@link Network} implementation backed by a dedicated axios instance. */
class NetworkAxios implements Network {
  axios: Axios.AxiosInstance;

  /** @param config Options passed to `axios.create`. */
  constructor(config = {}) {
    this.axios = Axios.default.create(config)
  }

  /** GET `url` with `params` as the query, asking for a text response. */
  get(url, params?:any,config?:any) {
    return this.axios.get(url, Object.assign({},{params, responseType: 'text'},config))
  }

  /** GET `url` with `params` as the query, using axios' default response handling. */
  getJSON(url, params?:any,config?:any) {
    return this.axios.get(url, Object.assign({},{params},config))
  }

  /**
   * POST `data` to `url`.
   * Fixed: this used to go through `axios.get`, which forces the GET verb, so
   * the request was never actually sent as a POST.
   */
  postJSON(url, data?:any,config?:any) {
    return this.axios.post(url, data, Object.assign({}, config))
  }

  /** POST `data` to `url`, asking for a text response (see `postJSON` for the fix). */
  postJSON_rText(url, data?:any,config?:any) {
    return this.axios.post(url, data, Object.assign({},{responseType: 'text'},config))
  }
}
/** {@link Network} implementation backed by `urllib.request`. */
class NetworkUrlLib implements Network {
  config: any;
  maxConnect: number;

  /**
   * @param config Default urllib request options. An axios-style `proxy`
   *               object is converted to a proxy URL on a private copy, so
   *               the caller's object is left untouched.
   */
  constructor(config = {}) {
    this.config = Object.assign({}, config);
    this.make_proxy(this.config);
    // Fixed: any truthy setting used to be replaced by a hard-coded 5.
    this.maxConnect = this.config.maxConnect || 1;
  }

  /**
   * Rewrites `config.proxy` in place from `{protocol, host, port, auth}` to a
   * `protocol://[user:pass@]host[:port]` string. Does nothing when there is no
   * proxy object with a `host` (for example when it is already a string).
   * A missing protocol defaults to `http` and a missing port is omitted, so the
   * result never contains the text `undefined`.
   */
  make_proxy(config?:any){
    if (!config || !config.proxy || !config.proxy.host) {
      return;
    }
    const {protocol, host, port, auth} = config.proxy;
    const credentials = auth ? `${auth.username}:${auth.password}@` : '';
    const portSuffix = port ? `:${port}` : '';
    config.proxy = `${protocol || 'http'}://${credentials}${host}${portSuffix}`;
  }

  /**
   * Merges per-call defaults, the instance defaults and the per-request
   * options (later wins). The per-request options are copied first so the
   * proxy rewrite does not mutate the caller's object.
   */
  private buildOptions(defaults:any, config?:any) {
    const perRequest = Object.assign({}, config);
    this.make_proxy(perRequest);
    return Object.assign(defaults, this.config, perRequest);
  }

  /** GET `url` with `params` as request data, asking for a text response. */
  get(url, params?:any,config?:any) {
    return Urllib.request(url, this.buildOptions({ method: 'GET',data:params, dataType: 'text'}, config))
  }

  /** GET `url` with `params` as request data, asking for a parsed JSON response. */
  getJSON(url, params?:any,config?:any){
    return Urllib.request(url, this.buildOptions({ method: 'GET',data:params, dataType: 'json'}, config))
  }

  /** POST `data` to `url`, asking for a parsed JSON response. */
  postJSON(url, data?:any,config?:any){
    return Urllib.request(url, this.buildOptions({method: 'POST',data, dataType: 'json'}, config))
  }

  /** POST `data` to `url`, asking for a text response. */
  postJSON_rText(url, data?:any,config?:any){
    return Urllib.request(url, this.buildOptions({ method: 'POST',data, dataType: 'text'}, config))
  }
}
/**
 * Pairs a handler with the arguments it is meant to receive.
 * NOTE(review): `exec` is an empty placeholder, so nothing ever invokes the handler.
 */
class Task {
  handler: Function;
  args?: any;

  /**
   * @param handler Function associated with this task.
   * @param args    Value stored alongside the handler.
   */
  constructor(handler, args) {
    Object.assign(this, {handler, args});
  }

  /** Currently does nothing and returns `undefined`. */
  exec(): void {
    return;
  }
}
/**
 * Holder for pool settings.
 * NOTE(review): `run` only constructs a Task and drops it — nothing is queued
 * or executed, and `tasks` is never assigned.
 */
class ThreadPool {
  maxConnect: number;
  timeout: number;
  tasks: [];

  /**
   * @param config Properties copied onto the instance. Falsy `maxConnect` and
   *               `timeout` are replaced by 1 and 100000 respectively.
   */
  constructor(config) {
    Object.assign(this, config);
    this.maxConnect = this.maxConnect || 1;
    this.timeout = this.timeout || 100000;
  }

  /** Wraps `handler` and `args` in a Task; the task is not retained. */
  run(handler, args) {
    const pending = new Task(handler, args);
  }
}
/**
 * Per-task execution context: owns the service clients built from the
 * configuration and runs the configurable pipeline steps
 * (beforeRequest → request → format → store → release) for one URL.
 * `logger` is assigned by the creator before `init()` is called.
 */
class Context {
  config: Config;
  mysql: Rds;
  redis: Redis;
  oss: OSS;
  cos: Cos;
  jquery: any;
  axios: any;
  urllib: any;
  network: any;
  res: any;
  data: any;
  utils: Utils;
  cache: Cache;
  logger: winston.Logger;
  done: boolean;

  /** Stores the configuration only; clients are created by `init()`. */
  constructor(config) {
    this.config = config;
  }

  /**
   * Builds the clients described by the configuration: existing instances are
   * reused, option objects are passed to the matching constructor. Then selects
   * the network client and the cache backend and finally awaits the `env` hook.
   */
  async init() {
    const {mysql, redis, oss, cos, axios, env, cache, urllib, network} = this.config;
    this.jquery = cheerio;
    this.utils = new Utils();
    if (mysql instanceof Rds) {
      this.mysql = mysql;
    } else if (mysql) {
      this.mysql = new Rds(mysql)
    }
    if (redis instanceof Redis) {
      this.redis = redis;
    } else if (redis) {
      this.redis = new Redis(redis)
    }
    if (oss instanceof OSS) {
      this.oss = oss;
    } else if (oss) {
      this.oss = new OSS(oss)
    }
    if (cos instanceof Cos) {
      this.cos = cos;
    } else if (cos) {
      this.cos = new Cos(cos)
    }
    // Both branches of the former if/else made this same call.
    this.axios = Axios.default.create(axios);
    this.urllib = Urllib;
    if (network && network.type === 'axios') {
      this.network = new NetworkAxios(axios);
    } else {
      this.network = new NetworkUrlLib(urllib);
    }
    if (cache) {
      // Any other `type` leaves `this.cache` unset, i.e. caching disabled.
      switch (cache.type) {
        case 'file':
          this.cache = new CacheFile(this);
          break;
        case 'oss':
          this.cache = new CacheOSS(this);
          break;
        case 'cos':
          this.cache = new CacheCos(this);
          break;
      }
    }
    if (env) {
      await env(this);
    }
  }

  /** Cache key for `url`: the `cache_url` hook if configured, else `utils.url_dir(url)`. */
  async cache_url(url) {
    const {cache_url} = this.config;
    if (cache_url) {
      return await cache_url(this, url)
    }
    return this.utils.url_dir(url);
  }

  /** Awaits the `beforeRequest` hook when one is configured. */
  async beforeRequest(url) {
    const {beforeRequest} = this.config;
    if (beforeRequest) {
      await beforeRequest(this, url);
    }
  }

  /** URL actually requested for `url`: the `request_url` hook if configured, else `url` itself. */
  async request_url(url) {
    const {request_url} = this.config;
    if (request_url) {
      return await request_url(this, url);
    }
    return url;
  }

  /**
   * Fetches `url`. A configured `request` hook is called with the original
   * `url`; otherwise the network client GETs the resolved request URL and the
   * elapsed time is logged at debug level. Errors are logged and rethrown.
   */
  async request(url) {
    const {request} = this.config;
    const request_url = await this.request_url(url);
    if (request) {
      try {
        return await request(this, url)
      } catch (e) {
        this.logger.error(util.format(`url:%s error: %s`, request_url, e.message), {tag: "request"})
        throw e;
      }
    }
    // Only the default path is timed; the earlier extra timer was never completed.
    const profiler = this.logger.startTimer();
    try {
      return await this.network.get(request_url)
    } catch (e) {
      this.logger.error(util.format(`url:%s error: %s`, request_url, e.message), {tag: "axios"})
      throw e;
    } finally {
      profiler.done({service: this.config.logger.service, tag: request_url, level: 'debug', message: 'request timer'});
    }
  }

  /**
   * Reads the cached response for `url` using the given context: the
   * `requestCache` hook if configured, else `{data: <cached content>}`.
   */
  async requestCache(ctx, url) {
    const {requestCache} = ctx.config
    if (requestCache) {
      return await requestCache(ctx, url);
    }
    const cacheUrl = await ctx.cache_url(url);
    return {
      data: await ctx.cache.get(cacheUrl)
    }
  }

  /** Result of the `format` hook, or `this.res.data` when none is configured; timed at debug level. */
  async format(url) {
    const profiler = this.logger.startTimer();
    const {format} = this.config;
    const request_url = await this.request_url(url);
    const res = format ? await format(this, url) : this.res.data;
    profiler.done({service: this.config.logger.service, tag: request_url, level: 'debug', message: 'format timer'});
    return res;
  }

  /** Awaits the `store` hook when one is configured; timed at debug level. */
  async store(url) {
    const profiler = this.logger.startTimer();
    const {store} = this.config;
    const request_url = await this.request_url(url);
    if (store) {
      await store(this, url)
    }
    profiler.done({service: this.config.logger.service, tag: request_url, level: 'debug', message: 'store timer'});
  }

  /**
   * Ends the mysql and redis clients (their completion is only logged, not
   * awaited) and then awaits the `release` hook when one is configured.
   */
  async release() {
    const {release} = this.config;
    if (this.mysql) {
      this.mysql.end(() => {
        this.logger.debug(`mysql end`, {tag: "release"})
      })
    }
    if (this.redis) {
      this.redis.quit(() => {
        this.logger.debug(`redis quit`, {tag: "release"})
      })
    }
    if (release) {
      await release(this)
      this.logger.debug(`release()`, {tag: "release"})
    }
  }
}
/** Defaults deep-merged underneath the user configuration by the Crawler constructor. */
const defaultConfig = {
// Concurrency passed to async.eachLimit when processing the URL list.
limit: 1,
logger: {
// When truthy, a Console transport is added to the logger.
console: true,
// Log level handed to winston.createLogger.
level: 'info',
// Used as the logger's default `service` meta and in timer entries.
service: "Crawler",
// Each entry is passed to `new winston.transports.File(...)`.
files: [],
}
}
/**
 * Runs the crawl: resolves the seed URLs, processes each one in its own
 * Context (optionally through a cache) and can repeat the whole run on a
 * schedule.
 */
export class Crawler {
  ctx: Context;
  config: Config | any;
  job: any;
  isRun: boolean;
  logger: winston.Logger;

  /**
   * @param config Crawler options, or a single URL string as shorthand for
   *               `{url: config}`. Options are deep-merged over `defaultConfig`.
   */
  constructor(config: Config | string | any) {
    if (typeof config === 'string') {
      config = {
        url: config
      }
    }
    this.config = _.merge({}, defaultConfig, config);
    this.logger = winston.createLogger({
      level: this.config.logger.level,
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.json()
      ),
      defaultMeta: {service: this.config.logger.service},
      transports: this.config.logger.files.map((file) => {
        return new winston.transports.File(file)
      }),
    });
    if (this.config.logger.console) {
      this.logger.add(new winston.transports.Console({
        format: winston.format.combine(
          winston.format.colorize(),
          winston.format.timestamp(),
          winston.format.json(),
          winston.format.printf(
            info => {
              // `<timestamp> <level> [<service>[:<tag>]]: <message>[:<durationMs>]`
              const line = `${info.timestamp} ${info.level} [${info.service}${info.tag ? `:${info.tag}` : ''}]: ${info.message}`;
              if (typeof info.durationMs !== "undefined") {
                return `${line}:${info.durationMs}`
              }
              return line;
            })
        )
      }));
    }
  }

  /**
   * Resolves the seed list: `[url]` when `url` is set, otherwise `urls`, which
   * may be an array or a (possibly async) function returning one.
   * @throws TypeError when `urls` is set to anything else.
   */
  async urls() {
    const {url, urls} = this.config;
    if (url) {
      return [url];
    }
    if (!urls) {
      return [];
    }
    if (typeof urls === 'function') {
      return await urls()
    }
    // Arrays used to fail with "urls is not a function".
    if (Array.isArray(urls)) {
      return urls;
    }
    throw new TypeError('config.urls must be an array or a function returning an array');
  }

  /**
   * Creates and initialises a Context sharing this crawler's logger.
   * @returns the context, or `null` when initialisation failed (the failure is
   *          logged and the partially built context released).
   */
  async newContext() {
    const ctx = new Context(_.assign({}, this.config));
    ctx.logger = this.logger;
    try {
      await ctx.init();
    } catch (e) {
      ctx.logger.error(e.message);
      await ctx.release()
      return null;
    }
    return ctx;
  }

  /**
   * Processes one URL: beforeRequest → (cache lookup or request) → format → store.
   * Pipeline errors are logged and swallowed; the context is always released.
   * @throws Error when no context could be created.
   */
  async task(url) {
    this.logger.info(util.format(`url:%j`, url), {tag: 'task'})
    const ctx = await this.newContext();
    if (!ctx) {
      throw new Error(`can't newContext`)
    }
    try {
      await ctx.beforeRequest(url);
      const request_url = await ctx.request_url(url);
      if (ctx.done) {
        // A hook marked this URL as already handled.
        this.logger.info(util.format(`url:%s `, request_url), {tag: "done"})
        return
      }
      if (ctx.cache) {
        const cacheUrl = await ctx.cache_url(url);
        this.logger.debug(util.format(`cacheUrl:%s `, cacheUrl), {tag: request_url});
        if (await ctx.cache.hasCache(cacheUrl)) {
          this.logger.debug(util.format(`hasCache`), {tag: request_url});
          ctx.res = await ctx.requestCache(ctx, url);
        } else {
          ctx.res = await ctx.request(url);
          await ctx.cache.set(cacheUrl, ctx.res.data)
        }
      } else {
        ctx.res = await ctx.request(url);
      }
      ctx.data = await ctx.format(url);
      await ctx.store(url);
    } catch (e) {
      this.logger.error(util.format(`url:%o error: %s`, url, e.message), {tag: "task"})
    } finally {
      // Single release point for the done, success and error paths.
      await ctx.release();
    }
  }

  /**
   * Runs `task` for every seed URL with at most `limit` in flight.
   * Rejects with the first task error unless `allowError` is set.
   */
  async tasks() {
    const {limit, allowError} = this.config;
    const urls = await this.urls();
    this.logger.info(util.format('urls.length %d', urls.length), {tag: "tasks"})
    return new Promise((resolve, reject) => {
      async.eachLimit(urls, limit, async (url) => {
        try {
          await this.task(url);
        } catch (e) {
          this.logger.error(util.format(`url:%s error %s`, url, e.message), {tag: 'tasks'})
          if (!allowError) {
            throw e;
          }
        }
      }, (err) => {
        // Arrow function: with `function`, `this.logger` threw here and the
        // promise never settled.
        if (err) {
          this.logger.error(util.format(`error:%s`, err.message), {tag: "tasks"});
          reject(err);
          return;
        }
        resolve(err);
      })
    })
  }

  /**
   * Starts crawling. With `cron` set, schedules a run per tick and skips ticks
   * while a run is still in progress; otherwise starts one run immediately.
   * Does not wait for the run(s) to finish.
   */
  async start() {
    const {cron} = this.config;
    if (cron) {
      this.logger.info(util.format('cron %j', cron), {tag: 'start'})
      this.job = schedule.scheduleJob(cron, () => {
        if (this.isRun) {
          return
        }
        this.runTasks();
      })
    } else {
      this.runTasks();
    }
  }

  /**
   * Runs `tasks()` in the background with `isRun` raised for its duration.
   * A failed run is logged and still clears `isRun`, so later scheduled runs
   * are not blocked and no rejection is left unhandled.
   */
  private runTasks() {
    this.isRun = true;
    this.tasks()
      .catch((e) => {
        this.logger.error(util.format(`error:%s`, e && e.message), {tag: 'start'});
      })
      .then(() => {
        this.isRun = false;
      })
  }

  /** Cancels the scheduled job, if any. A run already in progress is not interrupted. */
  async stop() {
    if (this.job) {
      this.job.cancel();
    }
  }
}