website-scrap-engine
Version:
Configurable website scraper in typescript
133 lines (130 loc) • 2.17 kB
text/typescript
/**
 * URI schemes whose links are treated as unprocessable.
 *
 * Each entry is a lowercase scheme name without the trailing `:`.
 * {@link fastUnProcessableUriSchemesMap} is built from this list, and
 * {@link skipLinks} drops any url that starts with one of these schemes
 * followed by `:`.
 *
 * The list is a subset of https://en.wikipedia.org/wiki/List_of_URI_schemes
 */
export const unProcessableUriSchemes = [
// Official IANA-registered schemes
'about',
'attachment',
'blob',
'cap',
'chrome',
'chrome-extension',
'cid',
'content',
'cvs',
'data',
'dav',
'dns',
'drm',
'ed2k',
'example',
'feed',
'filesystem',
'ftp',
'geo',
'git',
'icon',
'im',
'imap',
'info',
'ipn',
'ipp',
'ipps',
'irc',
'irc6',
'ircs',
'jar',
'ldap',
'ldaps',
'magnet',
'mailserver',
'mailto',
'maps',
'market',
'message',
'mid',
'mms',
'modem',
'ms-help',
'ms-settings',
'mvn',
'news',
'nfs',
'oid',
'pkcs11',
'platform',
'pop',
'redis',
'rediss',
'res',
'resource',
'rmi',
'rsync',
'rtmfp',
'rtmp',
'rtsp',
's3',
'service',
'sftp',
'shttp',
'sip',
'sips',
'skype',
'smb',
'sms',
'snews',
'snmp',
'spotify',
'ssh',
'steam',
'svn',
'tag',
'tel',
'telnet',
'tftp',
'udp',
'unreal',
'urn',
'view-source',
'vnc',
'ws',
'wss',
'xri',
// Unofficial but common URI schemes
'admin',
'app',
'javascript',
'jdbc',
'odbc',
// Unix sockets are supported by got, but not yet supported here
'unix'
];
/**
 * Lookup table over {@link unProcessableUriSchemes}, keyed by the first
 * character of each scheme. A caller only has to scan the schemes that
 * share the first character of the url it is checking.
 *
 * Schemes keep their relative order from the source list inside each bucket.
 */
export const fastUnProcessableUriSchemesMap: Record<string, string[]> = (() => {
  const byFirstChar: Record<string, string[]> = {};
  for (const scheme of unProcessableUriSchemes) {
    const key = scheme[0];
    const bucket = byFirstChar[key];
    if (bucket) {
      bucket.push(scheme);
    } else {
      // First scheme seen for this initial character: start a new bucket.
      byFirstChar[key] = [scheme];
    }
  }
  return byFirstChar;
})();
/**
 * Skip unprocessable links.
 *
 * A link is skipped when it is a fragment reference (starts with `#`), or
 * when its scheme (the text before the first `:`) is listed in
 * {@link fastUnProcessableUriSchemesMap}. Schemes are compared
 * case-insensitively, because RFC 3986 section 3.1 defines them as
 * case-insensitive: `JavaScript:void(0)` is skipped just like
 * `javascript:void(0)`.
 *
 * @param url - the link to check
 * @returns the unchanged `url` when it should be processed,
 * or nothing (`undefined`) when it should be skipped
 */
export function skipLinks(url: string): string | void {
  if (url.startsWith('#')) {
    return;
  }
  // Registered scheme names never contain ':', so the candidate scheme is
  // everything before the first colon. An index of 0 or -1 means no scheme.
  const colonIndex = url.indexOf(':');
  if (colonIndex > 0) {
    // The table holds lowercase names only; normalize before the lookup.
    const scheme = url.slice(0, colonIndex).toLowerCase();
    const candidates = fastUnProcessableUriSchemesMap[scheme.charAt(0)];
    if (candidates && candidates.includes(scheme)) {
      return;
    }
  }
  return url;
}