@crawlee/utils
A set of shared utilities that can be used by crawlers
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.downloadListOfUrls = downloadListOfUrls;
exports.extractUrls = extractUrls;
exports.tryAbsoluteURL = tryAbsoluteURL;
const tslib_1 = require("tslib");
const ow_1 = tslib_1.__importDefault(require("ow"));
const general_1 = require("./general");
const gotScraping_1 = require("./gotScraping");
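// Note on the imports: `ow` performs runtime validation of the options objects
// below, and `gotScraping` is Crawlee's got-based HTTP client that sends
// browser-like headers (a brief orientation comment, not part of the original file).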
/**
 * Returns a promise that resolves to an array of URLs parsed from the resource available at the provided URL.
 * Optionally, a custom regular expression and encoding may be provided.
*/
async function downloadListOfUrls(options) {
(0, ow_1.default)(options, ow_1.default.object.exactShape({
url: ow_1.default.string.url,
encoding: ow_1.default.optional.string,
urlRegExp: ow_1.default.optional.regExp,
proxyUrl: ow_1.default.optional.string,
}));
const { url, encoding = 'utf8', urlRegExp = general_1.URL_NO_COMMAS_REGEX, proxyUrl } = options;
    // Try to detect wrong URLs and fix them. Currently detects only a Google Sheets
    // sharing URL, which is rewritten to the CSV export URL.
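    // Example of the rewrite (illustrative; <id> stands for an arbitrary spreadsheet ID):
    //   https://docs.google.com/spreadsheets/d/<id>/edit#gid=0
    //   -> https://docs.google.com/spreadsheets/d/<id>/gviz/tq?tqx=out:csv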
const match = url.match(/^(https:\/\/docs\.google\.com\/spreadsheets\/d\/(?:\w|-)+)\/?/);
let fixedUrl = url;
if (match) {
fixedUrl = `${match[1]}/gviz/tq?tqx=out:csv`;
}
const { body: string } = await (0, gotScraping_1.gotScraping)({ url: fixedUrl, encoding, proxyUrl });
return extractUrls({ string, urlRegExp });
}
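// A minimal usage sketch for downloadListOfUrls (the URL is a hypothetical
// placeholder; assumes an async context and a plain-text resource containing URLs):
//
//   const urls = await downloadListOfUrls({ url: 'https://example.com/url-list.txt' });
//   // -> e.g. ['https://example.com/a', 'https://example.com/b']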
/**
 * Collects all URLs in an arbitrary string into an array, optionally using a custom regular expression.
*/
function extractUrls(options) {
(0, ow_1.default)(options, ow_1.default.object.exactShape({
string: ow_1.default.string,
urlRegExp: ow_1.default.optional.regExp,
}));
const lines = options.string.split('\n');
const result = [];
const urlRegExp = options.urlRegExp ?? general_1.URL_NO_COMMAS_REGEX;
for (const line of lines) {
result.push(...(line.match(urlRegExp) ?? []));
}
return result;
}
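// A minimal usage sketch for extractUrls (the input string is an illustrative
// assumption; with no urlRegExp given, the default URL_NO_COMMAS_REGEX is used):
//
//   const found = extractUrls({ string: 'docs: https://crawlee.dev\nrepo: https://github.com/apify/crawlee' });
//   // -> expected: ['https://crawlee.dev', 'https://github.com/apify/crawlee']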
/**
 * Helper function that resolves an href against a base URL when extracting URLs from a page.
 * Returns the absolute URL, or `undefined` when the input cannot be parsed as a URL.
*/
function tryAbsoluteURL(href, baseUrl) {
try {
return new URL(href, baseUrl).href;
}
catch {
return undefined;
}
}
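// A minimal usage sketch for tryAbsoluteURL (illustrative values):
//
//   tryAbsoluteURL('/docs/quick-start', 'https://crawlee.dev'); // -> 'https://crawlee.dev/docs/quick-start'
//   tryAbsoluteURL('not a url', undefined);                     // -> undefined (cannot be resolved)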
//# sourceMappingURL=extract-urls.js.map