UNPKG

ts-webcrawler

Version:

A typescript webcrawler library for downloading and parsing webpages

github.com/standahorvath/webcrawler

standahorvath/webcrawler

28 lines (27 loc) • 2.59 kB

JavaScript

"use strict"; // Description: Regex constants Object.defineProperty(exports, "__esModule", { value: true }); exports.headingTags = exports.imagesWithoutAlt = exports.titleTag = exports.metaTag = exports.relativeLinks = exports.relativeUrl = exports.emailExact = exports.email = exports.pathNameFromUrl = exports.absoluteUrlExact = exports.absoluteUrl = void 0; // Absolute url Regex - Match absolute url, case insensitive, match in middle of code exports.absoluteUrl = /(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;\(\)]*[-A-Z0-9+&@#\/%=~_|])/gi; // Absolute url Exact Regex - Match absolute url, case insensitive, exact match only (no match in middle of code) exports.absoluteUrlExact = /^(?:(?:(?:https?|ftp):)?\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z0-9-]+\.)+(?:[a-z]{2,}\.?)))(?::\d{2,5})?(?:[/?#]\S*)?$/i; // Path name from url Regex - Match path name from url exports.pathNameFromUrl = /(?:\w+:)?\/\/[^\/]+([^?#]+)/; // Email Regex - Match email, case insensitive, match in middle of code exports.email = /(([^<>()[\]\\.,;:\s@"]+(\.[^<>()[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))/gi; // Email Exact Regex - Match email, case insensitive, exact match only (no match in middle of code) exports.emailExact = /^(([^<>()[\]\\.,;:\s@"]+(\.[^<>()[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/gi; // Relative url Regex - Match relative url, case insensitive, match in middle of code exports.relativeUrl = /(?:url\(|<(?:a|link|script|img)[^>]+(?:src|href)\s*=\s*)(?!['"]?(?:data|http))['"]?([^'"\)\s>]+)/gi; // Relative links Regex - Match relative links, only links, case insensitive, match in middle of code exports.relativeLinks = /(?:url\(|<(?:a)[^>]+(?:href)\s*=\s*)(?!['"]?(?:http))['"]?([^'"\)\s>]+)/gi; // Meta tag Regex - Match meta tag, case insensitive, match in middle of code exports.metaTag = /<meta[^>]+>/gi; // Title tag regex - Match title tag, case insensitive, match in middle of code exports.titleTag = /<title[^>]*>([^<]+)<\/title>/gi; // Get image tags missing alt atribute // example <img src="image.jpg"> exports.imagesWithoutAlt = /<img\b(?![^>]*\balt=)[^>]*>/gi; // Get all heading tags, from H1 to H6 exports.headingTags = /<h[1-6][^>]*>([^<]+)<\/h[1-6]>/gi;