UNPKG

@crawlee/utils

Version:

A set of shared utilities that can be used by crawlers

656 lines • 26.3 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.DISCORD_REGEX_GLOBAL = exports.DISCORD_REGEX = exports.PINTEREST_REGEX_GLOBAL = exports.PINTEREST_REGEX = exports.TIKTOK_REGEX_GLOBAL = exports.TIKTOK_REGEX = exports.YOUTUBE_REGEX_GLOBAL = exports.YOUTUBE_REGEX = exports.FACEBOOK_REGEX_GLOBAL = exports.FACEBOOK_REGEX = exports.TWITTER_REGEX_GLOBAL = exports.TWITTER_REGEX = exports.INSTAGRAM_REGEX_GLOBAL = exports.INSTAGRAM_REGEX = exports.LINKEDIN_REGEX_GLOBAL = exports.LINKEDIN_REGEX = exports.EMAIL_REGEX_GLOBAL = exports.EMAIL_REGEX = void 0; exports.emailsFromText = emailsFromText; exports.emailsFromUrls = emailsFromUrls; exports.phonesFromText = phonesFromText; exports.phonesFromUrls = phonesFromUrls; exports.parseHandlesFromHtml = parseHandlesFromHtml; const tslib_1 = require("tslib"); const cheerio = tslib_1.__importStar(require("cheerio")); const cheerio_1 = require("./cheerio"); // Regex inspired by https://zapier.com/blog/extract-links-email-phone-regex/ const EMAIL_REGEX_STRING = '(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\\])'; /** * Regular expression to exactly match a single email address. * It has the following form: `/^...$/i`. */ exports.EMAIL_REGEX = new RegExp(`^${EMAIL_REGEX_STRING}$`, 'i'); /** * Regular expression to find multiple email addresses in a text. * It has the following form: `/.../ig`. */ exports.EMAIL_REGEX_GLOBAL = new RegExp(EMAIL_REGEX_STRING, 'ig'); const EMAIL_URL_PREFIX_REGEX = /^mailto:/i; /** * The function extracts email addresses from a plain text. * Note that the function preserves the order of emails and keep duplicates. * @param text Text to search in. * @return Array of emails addresses found. * If no emails are found, the function returns an empty array. */ function emailsFromText(text) { if (typeof text !== 'string') return []; return text.match(exports.EMAIL_REGEX_GLOBAL) || []; } /** * The function extracts email addresses from a list of URLs. * Basically it looks for all `mailto:` URLs and returns valid email addresses from them. * Note that the function preserves the order of emails and keep duplicates. * @param urls Array of URLs. * @return Array of emails addresses found. * If no emails are found, the function returns an empty array. */ function emailsFromUrls(urls) { if (!Array.isArray(urls)) throw new Error('The "urls" parameter must be an array'); const emails = []; for (const url of urls) { if (!url) continue; if (!EMAIL_URL_PREFIX_REGEX.test(url)) continue; const email = url.replace(EMAIL_URL_PREFIX_REGEX, '').trim(); if (exports.EMAIL_REGEX.test(email)) emails.push(email); } return emails; } // Supports URLs starting with `tel://`, `tel:/` and `tel:`, and similarly `phone`, `telephone` and `callto` const PHONE_URL_PREFIX_REGEX = /^(tel|phone|telephone|callto):(\/)?(\/)?/i; // It's pretty much impossible (and unmaintainable) to have just one large regular expression for all possible phone numbers. // So here we define various regular expression for typical phone number patterns, which are then used to compile // a single large regular expressions. Add more patterns as needed. // NOTE: The patterns are tested in the order as written below, so the longer ones should be before the shorter ones! const PHONE_REGEXS_STRINGS = [ // 775123456 '[0-9]{6,15}', // 1(413)555-2378 or 1(413)555.2378 or 1 (413) 555-2378 or 1 (413) 555 2378 or (303) 494-2320 '([0-9]{1,4}( )?)?\\([0-9]{2,4}\\)( )?[0-9]{2,4}(( )?(-|.))?( )?[0-9]{2,6}', // (51) 5667-9987 or (19)94138-9398 '\\([0-9]{2}\\)( )?[0-9]{4,5}-[0-9]{4}', // 1(262) 955-95-79 or 1(262)955.95.79 '([0-9]{1,4}( )?)?\\([0-9]{2,4}\\)( )?[0-9]{2,4}(( )?(-|.))?( )?[0-9]{2,6}', // 413-577-1234-564 '[0-9]{2,4}-[0-9]{2,4}-[0-9]{2,4}-[0-9]{2,6}', // 413-577-1234 '[0-9]{2,4}-[0-9]{2,4}-[0-9]{2,6}', // 413-577 '[0-9]{2,4}-[0-9]{2,6}', // 413.577.1234.564 '[0-9]{2,4}\\.[0-9]{2,4}\\.[0-9]{2,4}\\.[0-9]{2,6}', // 413.577.1234 '[0-9]{2,4}\\.[0-9]{2,4}\\.[0-9]{2,6}', // 413.577 '[0-9]{2,4}\\.[0-9]{2,6}', // 413 577 1234 564 '[0-9]{2,4} [0-9]{2,4} [0-9]{2,4} [0-9]{2,6}', // 413 577 1234 '[0-9]{2,4} [0-9]{2,4} [0-9]{2,6}', // 123 4567 '[0-9]{2,4} [0-9]{3,8}', // All phones might be prefixed with '+' or '00' ].map((regex) => `(00|\\+)?${regex}`); // The minimum number of digits a phone number can contain. // That's because the PHONE_REGEXS_STRINGS patterns are quite wide and report a lot of false positives. const PHONE_MIN_DIGITS = 7; // These are patterns that might be matched by PHONE_REGEXS_STRINGS, // but which are most likely not phone numbers. Add more patterns as needed. const SKIP_PHONE_REGEXS = [ // 2018-11-10 '^[0-9]{4}-[0-9]{2}-[0-9]{2}$', ]; const PHONE_REGEX_GLOBAL = new RegExp(`(${PHONE_REGEXS_STRINGS.join('|')})`, 'ig'); const PHONE_REGEX = new RegExp(`^(${PHONE_REGEXS_STRINGS.join('|')})$`, 'i'); const SKIP_PHONE_REGEX = new RegExp(`^(${SKIP_PHONE_REGEXS.join('|')})$`, 'i'); /** * The function attempts to extract phone numbers from a text. Please note that * the results might not be accurate, since phone numbers appear in a large variety of formats and conventions. * If you encounter some problems, please [file an issue](https://github.com/apify/crawlee/issues). * @param text Text to search the phone numbers in. * @return Array of phone numbers found. * If no phone numbers are found, the function returns an empty array. */ function phonesFromText(text) { if (typeof text !== 'string') return []; let phones = text.match(PHONE_REGEX_GLOBAL) || []; phones = phones.filter((phone) => { if (!phone) return false; // Skip too short phones, they are most likely incorrect if (phone.match(/[0-9]/g).length < PHONE_MIN_DIGITS) return false; // Skip phone numbers matching specific patterns if (SKIP_PHONE_REGEX.test(phone)) return false; return true; }); return phones; } /** * Finds phone number links in an array of URLs and extracts the phone numbers from them. * Note that the phone number links look like `tel://123456789`, `tel:/123456789` or `tel:123456789`. * @param urls Array of URLs. * @return Array of phone numbers found. * If no phone numbers are found, the function returns an empty array. */ function phonesFromUrls(urls) { if (!Array.isArray(urls)) throw new Error('The "urls" parameter must be an array'); const phones = []; for (const url of urls) { if (!url) continue; if (!PHONE_URL_PREFIX_REGEX.test(url)) continue; const phone = url.replace(PHONE_URL_PREFIX_REGEX, '').trim(); if (PHONE_REGEX.test(phone)) phones.push(phone); } return phones; } // NOTEs about the regular expressions // - They have just a single matching group for the profile username, all other groups are non-matching // - They use a negative lookbehind and lookahead assertions, which are only supported in Node 8+. // They are used to prevent matching URLs in strings like "blahttps://www.example.com" const LINKEDIN_REGEX_STRING = '(?<!\\w)(?:(?:http(?:s)?:\\/\\/)?(?:(?:(?:[a-z]+\\.)?linkedin\\.com\\/(?:in|company)\\/)([a-z0-9\\-_%=]{2,60})(?![a-z0-9\\-_%=])))(?:\\/)?'; const INSTAGRAM_REGEX_STRING = '(?<!\\w)(?:http(?:s)?:\\/\\/)?(?:(?:www\\.)?(?:instagram\\.com|instagr\\.am)\\/)(?!explore|_n|_u)([a-z0-9_.]{2,30})(?![a-z0-9_.])(?:/)?'; const TWITTER_RESERVED_PATHS = 'oauth|account|tos|privacy|signup|home|hashtag|search|login|widgets|i|settings|start|share|intent|oct|messages|explore|notifications|jobs|compose\\/post'; const X_SUBDOMAINS = 'business|help|about|blog|careers|developer|ads'; const TWITTER_REGEX_STRING = `(?<!\\w)(?:http(?:s)?:\\/\\/)?(?:www.)?(?<!(?:${X_SUBDOMAINS})\\.)(?:x|twitter)(?:.com)\\/(?!(?:${TWITTER_RESERVED_PATHS})(?:[\\'\\"\\?\\.\\/]|$))(?:@)?([a-z0-9_]{1,15})(?![a-z0-9_])(?:/)?`; const FACEBOOK_RESERVED_PATHS = 'rsrc\\.php|apps|groups|events|l\\.php|friends|images|photo.php|chat|ajax|dyi|common|policies|login|recover|reg|help|security|messages|marketplace|pages\\/(?:create|merge|search)|live|bookmarks|games|fundraisers|saved|gaming|salesgroups|jobs|people|ads|ad_campaign|weather|offers|recommendations|crisisresponse|onthisday|developers|settings|connect|business|plugins|intern|sharer'; const FACEBOOK_REGEX_STRING = `(?<!\\w)(?:http(?:s)?:\\/\\/)?(?:www.)?(?:facebook.com|fb.com)\\/(?!(?:${FACEBOOK_RESERVED_PATHS})(?:[\\'\\"\\?\\.\\/]|$))(profile\\.php\\?id\\=[0-9]{3,20}|pages\\/[a-z0-9-_][a-z0-9-_\\.\\/]{0,150}|(?!pages|profile\\.php)[a-z0-9-\\.]{5,51}(?![a-z0-9\\.]))(?:\\/)?`; const YOUTUBE_REGEX_STRING = '(?<!\\w)(?:https?:\\/\\/)?(?:youtu\\.be\\/|(?:www\\.|m\\.)?youtube\\.com(?:(?:(?:\\/(?:watch|v|embed|user|c(?:hannel)?)(?:\\.php)?)?(?:\\?[^ ]*v=|\\/))|(?:(?:\\/c)?\\/@)))([a-zA-Z0-9\\-_]{2,100})'; const TIKTOK_REGEX_STRING = '(?<!\\w)(?:http(?:s)?:\\/\\/)?(?:(?:www|m)\\.)?(?:tiktok\\.com)\\/(((?:(?:v|embed|trending)(?:\\?shareId=|\\/))[0-9]{2,50}(?![0-9]))|(?:@)[a-z0-9\\-_\\.]+((?:\\/video\\/)[0-9]{2,50}(?![0-9]))?)(?:\\/)?'; const PINTEREST_REGEX_STRING = '(?<!\\w)(?:http(?:s)?:\\/\\/)?(?:(?:(?:(?:www\\.)?pinterest(?:\\.com|(?:\\.[a-z]{2}){1,2}))|(?:[a-z]{2})\\.pinterest\\.com)(?:\\/))((pin\\/[0-9]{2,50})|((?!pin)[a-z0-9\\-_\\.]+(\\/[a-z0-9\\-_\\.]+)?))(?:\\/)?'; const DISCORD_REGEX_STRING = '(?<!\\w)(?:https?:\\/\\/)?(?:www\\.)?((?:(?:(?:canary|ptb).)?(?:discord|discordapp)\\.com\\/channels(?:\\/)[0-9]{2,50}(\\/[0-9]{2,50})*)|(?:(?:(?:canary|ptb).)?(?:discord\\.(?:com|me|li|gg|io)|discordapp\\.com)(?:\\/invite)?)\\/(?!channels)[a-z0-9\\-_]{2,50})(?:\\/)?'; /** * Regular expression to exactly match a single LinkedIn profile URL. * It has the following form: `/^...$/i` and matches URLs such as: * ``` * https://www.linkedin.com/in/alan-turing * en.linkedin.com/in/alan-turing * linkedin.com/in/alan-turing * https://www.linkedin.com/company/linkedin/ * ``` * * The regular expression does NOT match URLs with additional * subdirectories or query parameters, such as: * ``` * https://www.linkedin.com/in/linus-torvalds/latest-activity * ``` * * Example usage: * ``` * import { social } from 'crawlee'; * * if (social.LINKEDIN_REGEX.test('https://www.linkedin.com/in/alan-turing')) { * console.log('Match!'); * } * ``` */ exports.LINKEDIN_REGEX = new RegExp(`^${LINKEDIN_REGEX_STRING}$`, 'i'); /** * Regular expression to find multiple LinkedIn profile URLs in a text or HTML. * It has the following form: `/.../ig` and matches URLs such as: * ``` * https://www.linkedin.com/in/alan-turing * en.linkedin.com/in/alan-turing * linkedin.com/in/alan-turing * https://www.linkedin.com/company/linkedin/ * ``` * * If the profile URL contains subdirectories or query parameters, the regular expression * extracts just the base part of the profile URL. For example, from text such as: * ``` * https://www.linkedin.com/in/linus-torvalds/latest-activity * ``` * the expression extracts just the following base URL: * ``` * https://www.linkedin.com/in/linus-torvalds * ``` * * Example usage: * ``` * import { social } from 'crawlee'; * * const matches = text.match(social.LINKEDIN_REGEX_GLOBAL); * if (matches) console.log(`${matches.length} LinkedIn profiles found!`); * ``` */ exports.LINKEDIN_REGEX_GLOBAL = new RegExp(LINKEDIN_REGEX_STRING, 'ig'); /** * Regular expression to exactly match a single Instagram profile URL. * It has the following form: `/^...$/i` and matches URLs such as: * ``` * https://www.instagram.com/old_prague * www.instagram.com/old_prague/ * instagr.am/old_prague * ``` * * The regular expression does NOT match URLs with additional * subdirectories or query parameters, such as: * ``` * https://www.instagram.com/cristiano/followers * ``` * * It also does NOT match the following URLs: * ``` * https://www.instagram.com/explore/ * https://www.instagram.com/_n/ * https://www.instagram.com/_u/ * * Example usage: * ``` * import { social } from 'crawlee'; * * if (social.INSTAGRAM_REGEX.test('https://www.instagram.com/old_prague')) { * console.log('Match!'); * } * ``` */ exports.INSTAGRAM_REGEX = new RegExp(`^${INSTAGRAM_REGEX_STRING}$`, 'i'); /** * Regular expression to find multiple Instagram profile URLs in a text or HTML. * It has the following form: `/.../ig` and matches URLs such as: * ``` * https://www.instagram.com/old_prague * www.instagram.com/old_prague/ * instagr.am/old_prague * ``` * * If the profile URL contains subdirectories or query parameters, the regular expression * extracts just the base part of the profile URL. For example, from text such as: * ``` * https://www.instagram.com/cristiano/followers * ``` * the expression extracts just the following base URL: * ``` * https://www.instagram.com/cristiano * ``` * * The regular expression does NOT match the following URLs: * ``` * https://www.instagram.com/explore/ * https://www.instagram.com/_n/ * https://www.instagram.com/_u/ * ``` * * Example usage: * ``` * import { social } from 'crawlee'; * * const matches = text.match(social.INSTAGRAM_REGEX_GLOBAL); * if (matches) console.log(`${matches.length} Instagram profiles found!`); * ``` */ exports.INSTAGRAM_REGEX_GLOBAL = new RegExp(INSTAGRAM_REGEX_STRING, 'ig'); /** * Regular expression to exactly match a single Twitter profile URL. * It has the following form: `/^...$/i` and matches URLs such as: * ``` * https://www.twitter.com/apify * twitter.com/apify * ``` * * The regular expression does NOT match URLs with additional * subdirectories or query parameters, such as: * ``` * https://www.twitter.com/realdonaldtrump/following * ``` * * Example usage: * ``` * import { social } from 'crawlee'; * * if (social.TWITTER_REGEX.test('https://www.twitter.com/apify')) { * console.log('Match!'); * } * ``` */ exports.TWITTER_REGEX = new RegExp(`^${TWITTER_REGEX_STRING}$`, 'i'); /** * Regular expression to find multiple Twitter profile URLs in a text or HTML. * It has the following form: `/.../ig` and matches URLs such as: * ``` * https://www.twitter.com/apify * twitter.com/apify * ``` * * If the profile URL contains subdirectories or query parameters, the regular expression * extracts just the base part of the profile URL. For example, from text such as: * ``` * https://www.twitter.com/realdonaldtrump/following * ``` * the expression extracts only the following base URL: * ``` * https://www.twitter.com/realdonaldtrump * ``` * * Example usage: * ``` * import { social } from 'crawlee'; * * const matches = text.match(social.TWITTER_REGEX_STRING); * if (matches) console.log(`${matches.length} Twitter profiles found!`); * ``` */ exports.TWITTER_REGEX_GLOBAL = new RegExp(TWITTER_REGEX_STRING, 'ig'); /** * Regular expression to exactly match a single Facebook profile URL. * It has the following form: `/^...$/i` and matches URLs such as: * ``` * https://www.facebook.com/apifytech * facebook.com/apifytech * fb.com/apifytech * https://www.facebook.com/profile.php?id=123456789 * ``` * * The regular expression does NOT match URLs with additional * subdirectories or query parameters, such as: * ``` * https://www.facebook.com/apifytech/photos * ``` * * Example usage: * ``` * import { social } from 'crawlee'; * * if (social.FACEBOOK_REGEX.test('https://www.facebook.com/apifytech')) { * console.log('Match!'); * } * ``` */ exports.FACEBOOK_REGEX = new RegExp(`^${FACEBOOK_REGEX_STRING}$`, 'i'); /** * Regular expression to find multiple Facebook profile URLs in a text or HTML. * It has the following form: `/.../ig` and matches URLs such as: * ``` * https://www.facebook.com/apifytech * facebook.com/apifytech * fb.com/apifytech * ``` * * If the profile URL contains subdirectories or query parameters, the regular expression * extracts just the base part of the profile URL. For example, from text such as: * ``` * https://www.facebook.com/apifytech/photos * ``` * the expression extracts only the following base URL: * ``` * https://www.facebook.com/apifytech * ``` * * Example usage: * ``` * import { social } from 'crawlee'; * * const matches = text.match(social.FACEBOOK_REGEX_GLOBAL); * if (matches) console.log(`${matches.length} Facebook profiles found!`); * ``` */ exports.FACEBOOK_REGEX_GLOBAL = new RegExp(FACEBOOK_REGEX_STRING, 'ig'); /** * Regular expression to exactly match a single Youtube channel, user or video URL. * It has the following form: `/^...$/i` and matches URLs such as: * ``` * https://www.youtube.com/watch?v=kM7YfhfkiEE * https://youtu.be/kM7YfhfkiEE * https://www.youtube.com/c/TrapNation * https://www.youtube.com/channel/UCklie6BM0fhFvzWYqQVoCTA * https://www.youtube.com/user/pewdiepie * ``` * * Please note that this won't match URLs like https://www.youtube.com/pewdiepie that redirect to /user or /channel. * * Example usage: * ``` * import { social } from 'crawlee'; * * if (social.YOUTUBE_REGEX.test('https://www.youtube.com/watch?v=kM7YfhfkiEE')) { * console.log('Match!'); * } * ``` */ exports.YOUTUBE_REGEX = new RegExp(`^${YOUTUBE_REGEX_STRING}$`, 'i'); /** * Regular expression to find multiple Youtube channel, user or video URLs in a text or HTML. * It has the following form: `/.../ig` and matches URLs such as: * ``` * https://www.youtube.com/watch?v=kM7YfhfkiEE * https://youtu.be/kM7YfhfkiEE * https://www.youtube.com/c/TrapNation * https://www.youtube.com/channel/UCklie6BM0fhFvzWYqQVoCTA * https://www.youtube.com/user/pewdiepie * ``` * * Please note that this won't match URLs like https://www.youtube.com/pewdiepie that redirect to /user or /channel. * * Example usage: * ``` * import { social } from 'crawlee'; * * const matches = text.match(social.YOUTUBE_REGEX_GLOBAL); * if (matches) console.log(`${matches.length} Youtube videos found!`); * ``` */ exports.YOUTUBE_REGEX_GLOBAL = new RegExp(YOUTUBE_REGEX_STRING, 'ig'); /** * Regular expression to exactly match a Tiktok video or user account. * It has the following form: `/^...$/i` and matches URLs such as: * ``` * https://www.tiktok.com/trending?shareId=123456789 * https://www.tiktok.com/embed/123456789 * https://m.tiktok.com/v/123456789 * https://www.tiktok.com/@user * https://www.tiktok.com/@user-account.pro * https://www.tiktok.com/@user/video/123456789 * ``` * * Example usage: * ``` * import { social } from 'crawlee'; * * if (social.TIKTOK_REGEX.test('https://www.tiktok.com/trending?shareId=123456789')) { * console.log('Match!'); * } * ``` */ exports.TIKTOK_REGEX = new RegExp(`^${TIKTOK_REGEX_STRING}$`, 'i'); /** * Regular expression to find multiple Tiktok videos or user accounts in a text or HTML. * It has the following form: `/.../ig` and matches URLs such as: * ``` * https://www.tiktok.com/trending?shareId=123456789 * https://www.tiktok.com/embed/123456789 * https://m.tiktok.com/v/123456789 * https://www.tiktok.com/@user * https://www.tiktok.com/@user-account.pro * https://www.tiktok.com/@user/video/123456789 * ``` * * Example usage: * ``` * import { social } from 'crawlee'; * * const matches = text.match(social.TIKTOK_REGEX_GLOBAL); * if (matches) console.log(`${matches.length} Tiktok profiles/videos found!`); * ``` */ exports.TIKTOK_REGEX_GLOBAL = new RegExp(TIKTOK_REGEX_STRING, 'ig'); /** * Regular expression to exactly match a Pinterest pin, user or user's board. * It has the following form: `/^...$/i` and matches URLs such as: * ``` * https://pinterest.com/pin/123456789 * https://www.pinterest.cz/pin/123456789 * https://www.pinterest.com/user * https://uk.pinterest.com/user * https://www.pinterest.co.uk/user * pinterest.com/user_name.gold * https://cz.pinterest.com/user/board * ``` * * Example usage: * ``` * import { social } from 'crawlee'; * * if (social.PINTEREST_REGEX.test('https://pinterest.com/pin/123456789')) { * console.log('Match!'); * } * ``` */ exports.PINTEREST_REGEX = new RegExp(`^${PINTEREST_REGEX_STRING}$`, 'i'); /** * Regular expression to find multiple Pinterest pins, users or boards in a text or HTML. * It has the following form: `/.../ig` and matches URLs such as: * ``` * https://pinterest.com/pin/123456789 * https://www.pinterest.cz/pin/123456789 * https://www.pinterest.com/user * https://uk.pinterest.com/user * https://www.pinterest.co.uk/user * pinterest.com/user_name.gold * https://cz.pinterest.com/user/board * ``` * * Example usage: * ``` * import { social } from 'crawlee'; * * const matches = text.match(social.PINTEREST_REGEX_GLOBAL); * if (matches) console.log(`${matches.length} Pinterest pins found!`); * ``` */ exports.PINTEREST_REGEX_GLOBAL = new RegExp(PINTEREST_REGEX_STRING, 'ig'); /** * Regular expression to exactly match a Discord invite or channel. * It has the following form: `/^...$/i` and matches URLs such as: * ``` * https://discord.gg/discord-developers * https://discord.com/invite/jyEM2PRvMU * https://discordapp.com/channels/1234 * https://discord.com/channels/1234/1234 * discord.gg/discord-developers * ``` * * Example usage: * ``` * import { social } from 'crawlee'; * * if (social.DISCORD_REGEX.test('https://discord.gg/discord-developers')) { * console.log('Match!'); * } * ``` */ exports.DISCORD_REGEX = new RegExp(`^${DISCORD_REGEX_STRING}$`, 'i'); /** * Regular expression to find multiple Discord channels or invites in a text or HTML. * It has the following form: `/.../ig` and matches URLs such as: * ``` * https://discord.gg/discord-developers * https://discord.com/invite/jyEM2PRvMU * https://discordapp.com/channels/1234 * https://discord.com/channels/1234/1234 * discord.gg/discord-developers * ``` * * Example usage: * ``` * import { social } from 'crawlee'; * * const matches = text.match(social.DISCORD_REGEX_GLOBAL); * if (matches) console.log(`${matches.length} Discord channels found!`); * ``` */ exports.DISCORD_REGEX_GLOBAL = new RegExp(DISCORD_REGEX_STRING, 'ig'); /** * The function attempts to extract emails, phone numbers and social profile URLs from a HTML document, * specifically LinkedIn, Twitter, Instagram and Facebook profile URLs. * The function removes duplicates from the resulting arrays and sorts the items alphabetically. * * Note that the `phones` field contains phone numbers extracted from the special phone links * such as `[call us](tel:+1234556789)` (see {@link phonesFromUrls}) * and potentially other sources with high certainty, while `phonesUncertain` contains phone numbers * extracted from the plain text, which might be very inaccurate. * * **Example usage:** * ```typescript * import { launchPuppeteer, social } from 'crawlee'; * * const browser = await launchPuppeteer(); * const page = await browser.newPage(); * await page.goto('http://www.example.com'); * const html = await page.content(); * * const result = social.parseHandlesFromHtml(html); * console.log('Social handles:'); * console.dir(result); * ``` * * @param html HTML text * @param [data] Optional object which will receive the `text` and `$` properties * that contain text content of the HTML and `cheerio` object, respectively. This is an optimization * so that the caller doesn't need to parse the HTML document again, if needed. * @return An object with the social handles. */ function parseHandlesFromHtml(html, data = null) { const result = { emails: [], phones: [], phonesUncertain: [], linkedIns: [], twitters: [], instagrams: [], facebooks: [], youtubes: [], tiktoks: [], pinterests: [], discords: [], }; if (typeof html !== 'string') return result; const $ = cheerio.load(html, { decodeEntities: true }); if (data) data.$ = $; const text = (0, cheerio_1.htmlToText)($); if (data) data.text = text; // NOTE: we need to parse each text separately, orherwise we might concatenate unrelated texts // e.g. `<div>6HT<a>eva@example.com</a></div>` would become 6HTeva@example.com const texts = $('*') .contents() .toArray() .filter((node) => node.type === 'text') .map((node) => $(node).text().trim()); // Find all <a> links with href tag const linkUrls = []; $('a[href]').each((_index, elem) => { if (elem) linkUrls.push($(elem).attr('href')); }); result.emails = emailsFromUrls(linkUrls).concat(texts.flatMap(emailsFromText)); result.phones = phonesFromUrls(linkUrls); result.phonesUncertain = phonesFromText(text); // Note that these regexps extract just the base profile path. For example for // https://www.linkedin.com/in/carl-newman-123456a/detail/recent-activity/ // they match just: // https://www.linkedin.com/in/carl-newman-123456a result.linkedIns = html.match(exports.LINKEDIN_REGEX_GLOBAL) || []; result.twitters = html.match(exports.TWITTER_REGEX_GLOBAL) || []; result.instagrams = html.match(exports.INSTAGRAM_REGEX_GLOBAL) || []; result.facebooks = html.match(exports.FACEBOOK_REGEX_GLOBAL) || []; result.youtubes = html.match(exports.YOUTUBE_REGEX_GLOBAL) || []; result.tiktoks = html.match(exports.TIKTOK_REGEX_GLOBAL) || []; result.pinterests = html.match(exports.PINTEREST_REGEX_GLOBAL) || []; result.discords = html.match(exports.DISCORD_REGEX_GLOBAL) || []; // Sort and deduplicate handles for (const key of Object.keys(result)) { result[key].sort(); result[key] = [...new Set(result[key])].sort(); } return result; } //# sourceMappingURL=social.js.map