website-scrap-engine
Version: 
Configurable website scraper in typescript
217 lines (210 loc) • 7.78 kB
text/typescript
import type {BeforeRetryHook, OptionsInit, RequestError, Response} from 'got';
import got, {TimeoutError} from 'got';
import type {DownloadResource, RequestOptions} from './types.js';
import type {Resource} from '../resource.js';
import {generateSavePath, ResourceType} from '../resource.js';
import type {StaticDownloadOptions} from '../options.js';
import * as logger from '../logger/logger.js';
import {isUrlHttp, sleep} from '../util.js';
import URI from 'urijs';
/**
 * `beforeRetry` hook for got: writes one log entry per retry.
 *
 * The entry goes to `logger.retry` at `info` level, or at `warn` level once
 * `retryCount` is greater than 1. It contains the retry count, the request
 * url and the error's name, code and message; for timeout errors the `event`
 * field of the error is appended.
 *
 * Nothing is logged when the error or its request options are missing.
 *
 * @param error the error that caused the retry
 * @param retryCount the retry counter passed in by got
 */
export const beforeRetryHook: BeforeRetryHook = (
  error: RequestError,
  retryCount: number | undefined
) => {
  // Guard the error before reading from it: the url can only be obtained
  // from error.options, so without either of them there is nothing to log.
  const options = error?.options;
  if (!options) {
    return;
  }
  const url = String(options.url);
  // the first retry is routine, repeated retries deserve a warning
  const log = retryCount && retryCount > 1 ?
    logger.retry.warn : logger.retry.info;
  if (error instanceof TimeoutError || error.name === 'TimeoutError') {
    log.call(logger.retry, retryCount, url, error.name, error.code,
      error.message, (error as TimeoutError).event);
  } else {
    log.call(logger.retry, retryCount, url, error.name, error.code,
      error.message);
  }
};
/**
 * Loose view of whatever a failed request may throw.
 *
 * Every field is optional (including the inherited `name`, `message` and
 * `stack`), so any caught value can be inspected through this type.
 */
export interface DownloadError extends Partial<Error> {
  /** Error code; `getRetry` compares it against `'ETIMEDOUT'`. */
  code?: string;
  /** Logged by `getRetry` as the kind of timeout that happened. */
  event?: string;
  /** When truthy, `getRetry` rethrows instead of retrying a timeout. */
  retryLimitExceeded?: boolean;
}
/**
 * Requests `url` with got and retries by hand in the cases that would
 * otherwise slip through (originally a workaround for premature close
 * on node 12).
 *
 * At most 25 attempts are made, each with a fresh shallow copy of `options`.
 * An attempt is repeated when:
 * - the response, its body, or the body length is falsy (no delay);
 * - the error message is `'premature close'` (after `sleep(attempt * 200)`);
 * - the error is a `RequestError` / `TimeoutError` whose code is
 *   `'ETIMEDOUT'` or undefined and which is not flagged
 *   `retryLimitExceeded` (after `sleep(attempt * 300)`).
 *
 * Any other error is rethrown immediately.
 *
 * @param url the url to request
 * @param options got options, copied before every attempt
 * @returns the last response received; this may still be an empty one when
 * every attempt came back without a body
 * @throws the last retryable error when the final attempt failed with one
 */
export async function getRetry(
  url: string,
  options: OptionsInit
): Promise<Response<Buffer | string> | void> {
  let response: Response<Buffer | string> | void = undefined;
  let failure: DownloadError | void = undefined;
  let attempt = 0;
  while (attempt < 25) {
    // zero-based number of the attempt in progress
    const round = attempt;
    attempt += 1;
    failure = undefined;
    try {
      const attemptOptions: OptionsInit = {...options};
      response = (await got(url, attemptOptions)) as Response<Buffer | string>;
      if (response && response.body && response.body.length) {
        break;
      }
      logger.retry.warn(round, url, 'manually retry on empty response or body',
        response && response.body);
    } catch (e) {
      // force cast for typescript 4.4
      failure = e as DownloadError | void;
      if (failure && failure.message === 'premature close') {
        logger.retry.warn(round, url, 'manually retry on premature close',
          failure.name, failure.code, failure.event, failure.message);
        await sleep(round * 200);
        continue;
      }
      // these events might be accidentally unhandled
      if (failure && !failure.retryLimitExceeded) {
        const requestFailure =
          failure.name === 'RequestError' || failure.name === 'TimeoutError';
        // RequestError: Cannot read property 'request' of undefined
        // at Object.exports.default (got\dist\source\core\utils\timed-out.js:56:23)
        // comes with error.code === undefined
        const timeoutCode =
          failure.code === 'ETIMEDOUT' || failure.code === undefined;
        if (requestFailure && timeoutCode) {
          logger.retry.warn(round, url,
            `manually retry on ${failure.event} timeout`,
            failure.name, failure.code, failure.message);
          await sleep(round * 300);
          continue;
        }
      }
      throw e;
    }
  }
  if (failure) {
    // only reachable when the last attempt ended with a retryable error
    logger.error.error(url, 'no more retries on premature close or timeout',
      failure.message, failure.name, failure);
    throw failure;
  }
  return response;
}
/**
 * Downloads `res.downloadLink` as a buffer and records the outcome on `res`.
 *
 * The link is passed through `decodeURI` and then `encodeURI` before the
 * request. `res.refUrl`, when set and different from that link, is sent as
 * the `referer` header. The caller's `requestOptions` object is not modified.
 *
 * On success the response headers, finish time, download time, redirected
 * url and body are written to `res`. If the final url differs from `res.url`,
 * a `redirectedSavePath` is generated for it as well.
 *
 * @param res the resource to download, mutated in place
 * @param requestOptions base got options for the request
 * @param options download options used when generating the redirected
 * save path
 * @returns `res` with the body filled in; `res` without its timing fields
 * when no response was obtained; or `res` unchanged when the response had
 * no body
 */
export async function requestForResource(
  res: Resource & { downloadStartTimestamp: number },
  requestOptions: RequestOptions,
  options?: StaticDownloadOptions
): Promise<DownloadResource | Resource | void> {
  const downloadLink: string = encodeURI(decodeURI(res.downloadLink));

  const reqOptions: OptionsInit = {...requestOptions};
  reqOptions.responseType = 'buffer';
  if (res.refUrl && res.refUrl !== downloadLink) {
    // copy the headers so the caller's header object stays untouched
    reqOptions.headers = {...reqOptions.headers, referer: res.refUrl};
  }

  logger.request.info(res.url, downloadLink, res.refUrl,
    res.encoding, res.type);
  const response: Response<string | Buffer> | void =
    await getRetry(downloadLink, reqOptions);

  if (!response) {
    // nothing was downloaded, so drop the timing information
    const untimed = res as Resource;
    delete untimed.downloadStartTimestamp;
    delete untimed.waitTime;
    return untimed;
  }
  if (!response.body) {
    logger.error.warn('Empty response body:', downloadLink, response);
    return res as Resource;
  }

  res.meta.headers = response.headers;
  logger.response.info(response.statusCode, response.requestUrl, res.url,
    downloadLink, res.refUrl, res.encoding, res.type);

  const finishedAt = Date.now();
  res.finishTimestamp = finishedAt;
  res.downloadTime = finishedAt - res.downloadStartTimestamp;

  const finalUrl = response.url;
  res.redirectedUrl = finalUrl;
  // https://github.com/website-local/website-scrap-engine/issues/385
  if (finalUrl !== res.url) {
    const isHtml = res.type === ResourceType.Html;
    res.redirectedSavePath = generateSavePath(
      URI(finalUrl),
      isHtml,
      !options?.deduplicateStripSearch,
      options?.localSrcRoot);
  }

  res.body = response.body;
  return res;
}
/**
 * Tells whether one content-type header value names an html or xml document.
 */
function isHtmlLikeContentType(value: string): boolean {
  return value.includes('/html') ||
    value.includes('/xml') ||
    value.includes('application/xhtml+xml');
}

/**
 * Downloads the body of a resource over http(s), unless there is nothing
 * to do.
 *
 * The resource is returned untouched when it already has a body, when it is
 * a `StreamingBinary`, or when its download link is not an http url.
 * Otherwise the start time and wait time are recorded (once) and the
 * resource is fetched with `requestForResource`.
 *
 * Html resources get two optional checks, driven by `options.meta`:
 * - `warnForNonHtml`: a warning is logged when the response content type
 *   does not look like html/xml. For a repeated header, the warning is only
 *   logged when none of the values looks like html/xml.
 * - `detectIncompleteHtml`: when the body does not contain this marker, the
 *   download is repeated once with the same options. If the marker is still
 *   missing, or the retry produced nothing, a warning is logged and the
 *   retry result is returned as it is.
 *
 * @param res the resource to download, mutated in place
 * @param requestOptions base got options for the request
 * @param options download options
 * @returns the downloaded resource, or whatever `requestForResource`
 * returned when no body could be obtained
 */
export async function downloadResource(
  res: Resource,
  requestOptions: RequestOptions,
  options: StaticDownloadOptions
): Promise<DownloadResource | Resource | void> {
  if (res.body) {
    return res as DownloadResource;
  }
  if (res.type === ResourceType.StreamingBinary) {
    return res;
  }
  if (!isUrlHttp(res.downloadLink)) {
    return res;
  }
  if (!res.downloadStartTimestamp) {
    res.downloadStartTimestamp = Date.now();
    res.waitTime = res.downloadStartTimestamp - res.createTimestamp;
  }
  let downloadedResource: DownloadResource | Resource | void = await requestForResource(
    res as (Resource & { downloadStartTimestamp: number }), requestOptions, options);
  if (!downloadedResource || !downloadedResource.body) {
    return downloadedResource;
  }
  if (downloadedResource.type === ResourceType.Html) {
    if (options.meta.warnForNonHtml) {
      const headers = downloadedResource.meta.headers;
      if (headers) {
        const contentType =
          headers['content-type'] || headers['Content-Type'];
        let nonHtml = false;
        if (typeof contentType === 'string') {
          nonHtml = !isHtmlLikeContentType(contentType);
        } else if (Array.isArray(contentType)) {
          // a single html-like value is enough to accept the response
          nonHtml = !contentType.some(isHtmlLikeContentType);
        }
        if (nonHtml) {
          logger.error.warn('Detected non-html content type',
            downloadedResource.downloadLink, downloadedResource.rawUrl, contentType);
        }
      }
    }
    const marker = options.meta.detectIncompleteHtml;
    if (marker &&
      (typeof downloadedResource.body === 'string' ||
        Buffer.isBuffer(downloadedResource.body))) {
      if (!downloadedResource.body.includes(marker)) {
        logger.error.info('Detected incomplete html, try again',
          downloadedResource.downloadLink);
        // retry with the same options, so a redirected save path is
        // generated exactly like it was for the first attempt
        downloadedResource = await requestForResource(
          res as (Resource & { downloadStartTimestamp: number }),
          requestOptions, options);
      }
      // probably more retries here?
      // Bodies are requested as buffers, so both strings and buffers have
      // to be inspected for the marker.
      if (!downloadedResource ||
        (typeof downloadedResource.body === 'string' ||
          Buffer.isBuffer(downloadedResource.body)) &&
        !downloadedResource.body.includes(marker)) {
        logger.error.warn('Detected incomplete html twice', res.downloadLink);
        return downloadedResource;
      }
    }
    downloadedResource.finishTimestamp = Date.now();
    downloadedResource.downloadTime =
      downloadedResource.finishTimestamp - res.downloadStartTimestamp;
  }
  return downloadedResource;
}