ts-webcrawler
Version:
A typescript webcrawler library for downloading and parsing webpages
84 lines (83 loc) • 3.06 kB
JavaScript
;
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Asset = void 0;
const Url_1 = require("./Url");
const node_fetch_1 = __importDefault(require("node-fetch"));
const fs = require('fs');
class Asset {
constructor(url) {
this.url = url instanceof Url_1.Url ? url : new Url_1.Url(url);
this.data = null;
this.loaded = false;
this.code = null;
}
getUrl() {
return this.url;
}
getCode() {
return this.code;
}
getData() {
return this.data;
}
load() {
return __awaiter(this, void 0, void 0, function* () {
if (!this.loaded) {
try {
const response = yield (0, node_fetch_1.default)(this.url.toString());
this.code = response.status;
this.data = yield response.buffer();
this.loaded = true;
}
catch (error) {
// Load error
}
}
return this;
});
}
save(path, filename = '') {
return __awaiter(this, void 0, void 0, function* () {
if (!this.loaded) {
yield this.load();
}
if (this.data == null) {
throw new Error('Data is null');
}
const filePath = path + (filename == '' ? this.url.getFilename() : filename);
console.log(filePath);
const fileStream = fs.createWriteStream(filePath);
yield fileStream.write(this.data);
fileStream.end();
return this;
});
}
static parseSitemapUrl(content) {
const regex = /Sitemap: (.*)/gm;
let m;
let urls = [];
while ((m = regex.exec(content)) !== null) {
// This is necessary to avoid infinite loops with zero-width matches
if (m.index === regex.lastIndex) {
regex.lastIndex++;
}
if (m[1] == null)
continue;
urls.push(m[1]);
}
return urls;
}
}
exports.Asset = Asset;