UNPKG

ts-webcrawler

Version:

A typescript webcrawler library for downloading and parsing webpages

github.com/standahorvath/webcrawler

standahorvath/webcrawler

218 lines (217 loc) • 7.02 kB

JavaScript

"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Url = void 0;
const Regex_1 = require("../Constants/Regex");

// File extensions (lower-case, without the dot) that mark a URL as a page.
const PAGE_EXTENSIONS = ['html', 'htm', 'php', 'asp', 'aspx'];

// Leading "scheme://authority" or protocol-relative "//authority" prefix.
// Not global/sticky, so it carries no lastIndex state between calls.
const AUTHORITY_PREFIX = /^(?:[a-z][a-z0-9+.-]*:)?\/\/[^/?#]*/i;

// Characters that terminate the authority (host) component of a URL.
const AUTHORITY_END = /[/?#]/;

/**
 * Parsed view of a URL string.
 *
 * The constructor validates the string against `absoluteUrlExact` (imported
 * from ../Constants/Regex) and, when it is valid, pre-computes every component
 * through the static `extract*` helpers. For an invalid URL every component is
 * null (the query is an empty array) and both `isPage` and `isAsset` are false.
 */
class Url {
    /**
     * @param {string} url Raw URL string to parse.
     */
    constructor(url) {
        this._url = url;
        this.isValid = Regex_1.absoluteUrlExact.test(url);
        if (this.isValid) {
            this._protocol = Url.extractProtocol(this._url);
            this._host = Url.extractHost(this._url);
            // An origin only exists when both of its parts are non-empty.
            this._origin = (this._protocol && this._host) ? (this._protocol + this._host) : null;
            this._path = Url.extractPath(this._url);
            this._folder = Url.extractFolder(this._url);
            // extractQuery returns null when there is no "?"; normalise to [].
            this._query = Url.extractQuery(this._url) || [];
            this._hash = Url.extractHash(this._url);
            this._filename = Url.extractFilename(this._url);
            this.isPage = Url.isPage(this._url);
            this.isAsset = Url.isAsset(this._url);
        }
        else {
            this._protocol = null;
            this._host = null;
            this._origin = null;
            this._path = null;
            this._folder = null;
            this._query = [];
            this._hash = null;
            this._filename = null;
            this.isPage = false;
            this.isAsset = false;
        }
    }

    /**
     * @returns {string} The URL exactly as it was passed to the constructor.
     */
    toString() {
        return this._url;
    }

    /**
     * @returns {string} The URL exactly as it was passed to the constructor.
     */
    getFullUrl() {
        return this._url;
    }

    /**
     * Method returns host of url for example: domain.com
     * @returns {string|null} Returns host of url
     */
    getHost() {
        return this._host;
    }

    /**
     * Method returns protocol of url for example: http:// or https://
     * @returns {string|null} Returns protocol of url (including the "://")
     */
    getProtocol() {
        return this._protocol;
    }

    /**
     * Method returns origin of url for example: http://domain.com or https://domain.com
     * @returns {string|null} Returns origin of url
     */
    getOrigin() {
        return this._origin;
    }

    /**
     * Method returns path of url for example: /path/to/file.php
     * @returns {string|null} Returns path of url
     */
    getPath() {
        return this._path;
    }

    /**
     * Method returns folder of url, i.e. the path with its last "/"-separated
     * segment removed.
     * @returns {string|null} Returns folder of url
     */
    getFolder() {
        return this._folder;
    }

    /**
     * Method returns query of url as array of single-key objects
     * @returns {Array<Object<string, string>>} One `{ key: value }` object per query parameter
     */
    getQuery() {
        return this._query;
    }

    /**
     * Method returns hash (fragment) of url, without the leading "#"
     * @returns {string|null} Returns hash of url
     */
    getHash() {
        return this._hash;
    }

    /**
     * Method returns filename of url for example: file.ext
     * @returns {string|null} Returns filename of url
     */
    getFilename() {
        return this._filename;
    }

    /**
     * Method returns comparable string of url without hash and query
     * @returns {string} The URL truncated at the first "#" or "?"
     */
    getComparable() {
        return this._url.split('#')[0].split('?')[0];
    }

    /**
     * Extracts the fragment that follows the first "#", cut at the next "#" or
     * at a "?" (so "#route?x=1" yields "route").
     * @param {string} url
     * @returns {string|null} Fragment without the "#", or null when the url has no "#"
     */
    static extractHash(url) {
        if (!url.includes('#')) {
            return null;
        }
        return url.split('#')[1].split('?')[0];
    }

    /**
     * Extracts the query parameters that follow the first "?".
     *
     * Each parameter is split on its FIRST "=" only, so values that themselves
     * contain "=" (for example base64 padding) are kept intact. A trailing
     * fragment is removed before the pairs are split, and empty pairs (from "?"
     * alone or from "&&") are skipped. Keys and values are not URL-decoded.
     * @param {string} url
     * @returns {Array<Object<string, string>>|null} One single-key object per
     *   parameter, or null when the url contains no "?"
     */
    static extractQuery(url) {
        if (!url.includes('?')) {
            return null;
        }
        const queryString = url.split('?')[1].split('#')[0];
        const pairs = [];
        for (const item of queryString.split('&')) {
            if (item === '') {
                continue;
            }
            const separator = item.indexOf('=');
            const key = separator === -1 ? item : item.slice(0, separator);
            const value = separator === -1 ? '' : item.slice(separator + 1);
            // A computed key always creates an own property, even for "__proto__".
            pairs.push({ [key]: value });
        }
        return pairs;
    }

    /**
     * Extracts the filename: the last path segment, provided it contains a ".".
     *
     * The query, the fragment and the leading "scheme://host" part are removed
     * first, so a bare origin such as "http://domain.com" has no filename
     * instead of being reported as the file "domain.com".
     * @param {string} url
     * @returns {string|null} Filename, or null when the last segment has no "."
     */
    static extractFilename(url) {
        const withoutQueryAndHash = url.split('?')[0].split('#')[0];
        const pathPart = withoutQueryAndHash.replace(AUTHORITY_PREFIX, '');
        const lastSlash = pathPart.lastIndexOf('/');
        if (lastSlash === -1) {
            return null;
        }
        const lastSegment = pathPart.slice(lastSlash + 1);
        return lastSegment.includes('.') ? lastSegment : null;
    }

    /**
     * Extracts the path using the `pathNameFromUrl` pattern imported from
     * ../Constants/Regex. The last element of the match array is returned.
     * @param {string} url
     * @returns {string|null} Path, or null when the pattern is missing, does
     *   not match, or its last element is empty
     */
    static extractPath(url) {
        const pattern = Regex_1.pathNameFromUrl;
        // Guard before use; match() alone also tells us whether the url matched.
        if (!pattern) {
            return null;
        }
        const matches = url.match(pattern);
        if (!matches || matches.length === 0) {
            return null;
        }
        const pathName = matches.pop();
        return pathName ? pathName : null;
    }

    /**
     * Extracts the folder: the path with its last "/"-separated segment removed.
     * @param {string} url
     * @returns {string|null} Folder, or null when there is no path or the path has no "/"
     */
    static extractFolder(url) {
        const pathName = Url.extractPath(url);
        if (pathName == null || !pathName.includes('/')) {
            return null;
        }
        const segments = pathName.split('/');
        segments.pop();
        return segments.join('/');
    }

    /**
     * Extracts the origin (protocol followed by host), e.g. "https://domain.com".
     * @param {string} url
     * @returns {string|null} Origin, or null when the url has no "://"
     */
    static extractOrigin(url) {
        if (!url.includes('://')) {
            return null;
        }
        return Url.extractProtocol(url) + Url.extractHost(url);
    }

    /**
     * Extracts the host: the text after "://" up to the first "/", "?" or "#".
     * @param {string} url
     * @returns {string|null} Host, or null when the url has no "://"
     */
    static extractHost(url) {
        if (!url.includes('://')) {
            return null;
        }
        return url.split('://')[1].split(AUTHORITY_END)[0];
    }

    /**
     * Extracts the protocol including the separator, e.g. "https://".
     * @param {string} url
     * @returns {string|null} Protocol, or null when the url has no "://"
     */
    static extractProtocol(url) {
        if (!url.includes('://')) {
            return null;
        }
        return url.split('://')[0] + '://';
    }

    /**
     * Tells whether the url points to a page: it has no filename, or the
     * filename's LAST extension (compared case-insensitively) is one of
     * html, htm, php, asp, aspx.
     * @param {string} url
     * @returns {boolean}
     */
    static isPage(url) {
        const filename = Url.extractFilename(url);
        if (filename === null) {
            return true;
        }
        // extractFilename only returns names containing ".", so lastIndexOf >= 0.
        const extension = filename.slice(filename.lastIndexOf('.') + 1).toLowerCase();
        return PAGE_EXTENSIONS.includes(extension);
    }

    /**
     * Tells whether the url points to an asset, i.e. anything that is not a page.
     * @param {string} url
     * @returns {boolean}
     */
    static isAsset(url) {
        return !Url.isPage(url);
    }
}
exports.Url = Url;