website-scrap-engine
Version:
Configurable website scraper in typescript
75 lines (71 loc) • 2.23 kB
text/typescript
import * as path from 'node:path';
import type {Stats} from 'node:fs';
import {promises} from 'node:fs';
import type {Resource} from '../resource.js';
import {ResourceType} from '../resource.js';
import type {DownloadResource, RequestOptions} from './types.js';
import type {StaticDownloadOptions} from '../options.js';
import {error as errorLogger} from '../logger/logger.js';
/**
 * Prefix identifying a download link that refers to a local file.
 * Links are matched against it with startsWith, and the remainder of the
 * link after this prefix is used as the file system path.
 */
const FILE_PREFIX = 'file://';
/**
 * Resolves to true when the given path passes fs.access, false otherwise.
 * Never rejects.
 */
async function isAccessible(filePath: string): Promise<boolean> {
  try {
    await promises.access(filePath);
    return true;
  } catch {
    return false;
  }
}

/**
 * Serve a resource whose download link starts with `file://` from the
 * local file system instead of the network.
 *
 * - Resources that already have a body are returned as-is.
 * - Resources whose download link does not start with `file://` are
 *   returned untouched.
 * - {@link ResourceType.StreamingBinary} resources are copied straight to
 *   their destination path (the parent directory is created if missing)
 *   and nothing is returned.
 * - Every other resource type is read into `res.body`.
 *
 * For html resources pointing at a directory, `index.html` and then
 * `index.htm` inside that directory are tried.
 *
 * On success `res.meta.headers` is filled with `last-modified` and
 * `content-length` taken from the file that was actually read or copied;
 * a failure to collect this metadata is logged and otherwise ignored.
 *
 * NOTE(review): the path is taken verbatim from the link after the
 * `file://` prefix, so percent-encoded characters are not decoded —
 * confirm that callers never produce encoded file links.
 *
 * @param res the resource to resolve, mutated in place
 * @param requestOptions unused here, kept for the shared signature
 * @param options supplies the fallback `localRoot` for the copy destination
 * @returns the resource when a body is available or the link is not a
 * local file link; `undefined` when the path is empty or the file has
 * been copied to its destination
 * @throws file system errors from stat (html only), mkdir, copyFile
 * or readFile
 */
export async function readOrCopyLocalResource(
  res: Resource,
  requestOptions: RequestOptions,
  options: StaticDownloadOptions
): Promise<DownloadResource | Resource | void> {
  if (res.body) {
    return res as DownloadResource;
  }
  if (!res.downloadLink.startsWith(FILE_PREFIX)) {
    return res;
  }
  if (!res.downloadStartTimestamp) {
    res.downloadStartTimestamp = Date.now();
    res.waitTime = res.downloadStartTimestamp - res.createTimestamp;
  }
  let fileSrcPath = res.downloadLink.slice(FILE_PREFIX.length);
  if (!fileSrcPath) {
    return;
  }

  // index.html handling
  let stats: Stats | undefined;
  if (res.type === ResourceType.Html) {
    stats = await promises.stat(fileSrcPath);
    if (stats.isDirectory()) {
      for (const index of ['index.html', 'index.htm']) {
        const candidate = fileSrcPath + '/' + index;
        if (await isAccessible(candidate)) {
          fileSrcPath = candidate;
          // The stats collected above describe the directory, not the
          // index file; drop them so the headers below are built from
          // the file that is actually read.
          stats = undefined;
          break;
        }
      }
    }
  }

  if (res.type === ResourceType.StreamingBinary) {
    const fileDestPath =
      path.join(res.localRoot ?? options.localRoot, res.savePath);
    // copyFile does not create missing parent directories
    await promises.mkdir(path.dirname(fileDestPath), {recursive: true});
    await promises.copyFile(fileSrcPath, fileDestPath);
  } else {
    res.body = await promises.readFile(fileSrcPath, {
      encoding: res.encoding
    });
  }

  // Metadata is best-effort: the content is already read or copied,
  // so a failure here is only logged.
  try {
    if (!stats) {
      stats = await promises.stat(fileSrcPath);
    }
    res.meta.headers = {
      'last-modified': stats.mtime.toISOString(),
      'content-length': stats.size.toString()
    };
  } catch (e) {
    errorLogger.warn('stat ' + fileSrcPath, e);
  }

  res.finishTimestamp = Date.now();
  res.downloadTime =
    res.finishTimestamp - res.downloadStartTimestamp;
  if (res.type === ResourceType.StreamingBinary) {
    // Already written to its destination, nothing left to process
    return;
  }
  return res as DownloadResource;
}