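/*
 * spider-node
 * Crawler utility: fetches data according to the supplied configuration and rules.
 * (Package-viewer listing header: version not given; 271 lines (266 loc), 8.09 kB; text/typescript.)
 */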
import { Rule } from './rule';
import { EventEmitter } from 'events';
import iconv from 'iconv-lite';
import { Logger } from 'log4js';
import rp from 'request-promise';
import { createLogger } from './utils/logger';
import { ISpider, IRule, IHttp } from '../types';
/**
 * A request that could not start immediately and is waiting in a rule's queue.
 * It stores exactly the two arguments needed to call `Http.push` again later.
 */
interface IHttpTask {
// Target URL of the deferred request.
url: string;
// Request configuration that was supplied together with the URL.
config: IHttp.HttpConfig;
}
/**
 * Per-rule scheduling state kept in `Http.pool`, keyed by the rule's `rule`
 * property (a RegExp, per the pool's Map type).
 */
interface IRuleParams {
// The rule this state belongs to.
rule: Rule;
// Tasks waiting for a free connection slot, consumed from the front.
queue: IHttpTask[];
// Number of requests currently counted as running for this rule.
connect: number;
}
/**
 * HTTP download scheduler.
 *
 * Requests are grouped by the rule that produced them. Every rule owns an
 * entry in `pool` holding its running-request counter and a queue of tasks
 * waiting for a free slot. Results and failures are reported through events:
 * `complete`, `error-retry`, `error` and `completeAll`.
 */
export class Http extends EventEmitter {
  /**
   * Creates a new instance with the same configuration and middlewares.
   *
   * The constructor strips `maxConnect` and `delay` from the stored config,
   * so they are passed back in explicitly here; otherwise the copy would
   * silently lose both limits.
   */
  public static clone(http: Http): Http {
    return new Http(
      { ...http.config, maxConnect: http.maxConnect, delay: http.delay },
      http.middlewares
    );
  }

  public logger: Logger;
  /** Pause (ms) before the next queued task is started; 0 disables it. */
  public delay: number = 0;
  /** Fallback cap on concurrently running requests of a rule. */
  public maxConnect: number = Infinity;
  /** Number of requests currently running across all rules. */
  public connect: number = 0;
  public middlewares: IHttp.DownloadMiddleware[] = [];
  public timer: NodeJS.Timeout | null = null;
  /** Scheduling state per rule, keyed by the rule's `rule` property. */
  public pool: Map<RegExp, IRuleParams> = new Map<RegExp, IRuleParams>();
  public config: IHttp.HttpConstructorConfig = {
    overlist: new Set(),
    cacheMap: new Map(),
    meta: {},
  };

  /**
   * @param config Instance configuration. `maxConnect` and `delay` are moved
   * onto the instance and removed from the stored config. A truthy `delay`
   * forces `maxConnect` to 1.
   * @param middlewares Download middlewares appended to the instance list.
   */
  constructor(
    config: IHttp.HttpConstructorConfig = {
      repeat: false,
      meta: {},
    },
    middlewares?: IHttp.DownloadMiddleware[]
  ) {
    super();
    this.logger = createLogger(`${config.name}-http`, config.log);
    const cfg = (this.config = { ...this.config, ...config });
    if (cfg.maxConnect) {
      this.maxConnect = cfg.maxConnect;
      delete cfg.maxConnect;
    }
    if (cfg.delay) {
      this.maxConnect = 1;
      this.delay = cfg.delay;
      delete cfg.delay;
    }
    if (middlewares) {
      this.middlewares = [...this.middlewares, ...middlewares];
    }
  }

  /**
   * Performs the request and resolves with the full response object
   * (`run` reads `.body` from it), not just the body.
   */
  public async request(url: string, config: IHttp.HttpConfig) {
    const options: any = config;
    const result = await rp({
      url,
      ...options,
      resolveWithFullResponse: true,
    });
    return result;
  }

  /**
   * Reports whether a request for the given rule may start right now.
   *
   * A rule that has no pool entry yet is treated as having zero running
   * requests instead of raising a TypeError.
   */
  public inspect(url: string, config: { rule: Rule }): boolean {
    const ruleParam = this.pool.get(config.rule.rule);
    const running = ruleParam ? ruleParam.connect : 0;
    // NOTE(review): `maxCollect` looks like it is meant as the per-rule
    // connection cap; confirm the property name against the rule config type.
    const limit = config.rule.config.maxCollect || this.maxConnect;
    return running < limit;
  }

  /**
   * Starts the request immediately when the rule has a free slot, otherwise
   * queues it.
   *
   * @param priority When true the task is placed at the front of the queue.
   */
  public async push(
    url: string,
    config: IHttp.HttpConfig,
    priority: boolean = false
  ): Promise<any> {
    const ruleParam = this.ensureRuleParam(config.rule);
    if (this.inspect(url, config)) {
      this.run(url, config);
      return;
    }
    this.logger.debug(`任务加入队列:${url}`);
    if (priority) {
      ruleParam.queue.unshift({ url, config });
    } else {
      ruleParam.queue.push({ url, config });
    }
  }

  /** Adds a URL to `config.overlist`, creating the set when it is missing. */
  public addOverUrl(url: string) {
    if (!this.config.overlist) {
      this.config.overlist = new Set<string>();
    }
    this.config.overlist.add(url);
  }

  /**
   * Executes one request: runs the middlewares, downloads, decodes the body
   * and emits the outcome.
   *
   * Emits `complete` on success, `error-retry` when the task is re-queued
   * with a decremented `retry`, and `error` otherwise (including the case
   * where a middleware returned `false`). The rule's slot is always released
   * in `finally`, after the configured delay when more tasks are queued.
   */
  public async run(url: string, config: IHttp.HttpConfig): Promise<any> {
    const rule = config.rule;
    // `run` is public, so the pool entry may not exist yet.
    const ruleParam = this.ensureRuleParam(rule);
    this.connect++;
    ruleParam.connect += 1;
    this.logger.debug(`正在进行请求,目前请求数量:${this.connect}:url:${url}`);
    let hasErr = false;
    try {
      const $config: IHttp.HttpConfig | false = await this.callMiddleware({
        url,
        ...this.config,
        ...config,
        rootConfig: this.config,
      });
      if ($config === false) {
        this.logger.debug(`网络处理中间件阻止继续执:${url}`);
        hasErr = true;
        throw new Error('middleware return false');
      }
      // `encoding: null` keeps the body as a Buffer unless the caller
      // overrides it, so the charset can be resolved in `decode`.
      const response = await this.request(url, {
        jar: false,
        encoding: null,
        ...$config,
      });
      const result = response.body;
      const data: IHttp.Result = {
        url,
        config: { ...$config, response: response },
        data: result,
      };
      if (!$config.encoding) {
        const charset =
          $config.charset || ($config.rule && $config.rule.config.charset);
        data.data = this.decode(result, charset);
      }
      try {
        // Best effort: bodies that look like JSON are parsed, anything that
        // fails to parse is delivered as the decoded string.
        if (typeof data.data === 'string' && /^(\{|\[)/.test(data.data)) {
          data.data = JSON.parse(data.data);
        }
      } catch (_) {
        // Not valid JSON; keep the string.
      }
      this.logger.debug(`网络请求完成:${url}`);
      this.emit('complete', data);
    } catch (error) {
      const message = error instanceof Error ? error.message : undefined;
      // A middleware veto is final; every other failure consumes one retry.
      if (message !== 'middleware return false' && config.retry) {
        this.push(url, { ...config, retry: config.retry - 1 });
        this.emit('error-retry', { url, config, error });
        return;
      }
      this.emit('error', { url, config, error });
    } finally {
      this.connect--;
      const pending = ruleParam.queue.length;
      const delay = rule.config.delay || this.delay;
      if (pending > 0 && delay && !hasErr) {
        this.logger.debug(`网络请求等待延迟:${url},${delay}`);
        // The rule's slot stays occupied during the wait, which is what
        // spaces out consecutive requests of the same rule.
        setTimeout(() => {
          this.complete(url, config);
        }, delay);
      } else {
        this.complete(url, config);
      }
    }
  }

  /** Appends one middleware or a list of middlewares. */
  public appendMiddleware(
    fn: IHttp.DownloadMiddleware | IHttp.DownloadMiddleware[]
  ) {
    if (Array.isArray(fn)) {
      this.middlewares = this.middlewares.concat(fn);
      return;
    }
    this.middlewares.push(fn);
  }

  /**
   * Runs the middlewares in order, feeding each the config returned by the
   * previous one.
   *
   * @returns The final config, or `false` as soon as a middleware returns
   * exactly `false`. Other falsy results leave the config unchanged.
   */
  public async callMiddleware(
    config: IHttp.HttpMiddlewareConfig
  ): Promise<IHttp.HttpMiddlewareConfig | false> {
    let cfg: IHttp.HttpMiddlewareConfig = config;
    for (const fn of this.middlewares) {
      const rc: IHttp.HttpMiddlewareConfig | false = await fn(cfg);
      if (rc) {
        cfg = rc;
      } else if (rc === false) {
        return false;
      }
    }
    return cfg;
  }

  /**
   * Converts a response body to a string.
   *
   * With an explicit `charset` the buffer is decoded with it directly.
   * Otherwise the buffer is decoded as UTF-8 and searched for a
   * `charset=...` declaration; when that names a different encoding known
   * to iconv, the buffer is decoded again with it. An unknown or missing
   * declaration yields the UTF-8 text.
   */
  public decode(buffer: Buffer, charset?: any) {
    if (charset) {
      return iconv.decode(buffer, charset);
    }
    const text = iconv.decode(buffer, 'utf8');
    // Capture only the encoding name, with or without surrounding quotes.
    const match = /charset\s*=\s*["']?\s*([\w-]+)/i.exec(text);
    const detected = match ? match[1] : 'utf8';
    if (/^utf-?8$/i.test(detected) || !iconv.encodingExists(detected)) {
      return text;
    }
    return iconv.decode(buffer, detected);
  }

  /**
   * Releases the slot held by a finished request, starts as many queued
   * tasks of the same rule as the limit allows, and emits `completeAll`
   * once nothing is running or queued for any rule.
   */
  private complete(url: string, config: IHttp.HttpConfig): void {
    const ruleParam = this.ensureRuleParam(config.rule);
    ruleParam.connect -= 1;
    while (this.inspect(url, config)) {
      const task = ruleParam.queue.shift();
      if (!task) {
        break;
      }
      this.push(task.url, task.config);
    }
    this.logger.debug(
      `当前规则总任务数:${ruleParam.queue.length},当前运行总数量:${this.connect}`
    );
    if (this.isIdle()) {
      this.emit('completeAll');
    }
  }

  /** True when no rule has queued tasks or running requests. */
  public isIdle() {
    return !Array.from(this.pool.values()).some((entry) =>
      Boolean(entry.queue.length || entry.connect)
    );
  }

  /** Returns the pool entry for a rule, creating an empty one on first use. */
  private ensureRuleParam(rule: Rule): IRuleParams {
    let ruleParam = this.pool.get(rule.rule);
    if (!ruleParam) {
      ruleParam = { rule, connect: 0, queue: [] };
      this.pool.set(rule.rule, ruleParam);
    }
    return ruleParam;
  }
}
export default Http;