UNPKG

@sveltejs/kit

Version:

SvelteKit is the fastest way to build Svelte apps

244 lines (195 loc) • 4.96 kB
import { resolve } from '../../utils/url.js'; import { decode } from './entities.js'; const DOCTYPE = 'DOCTYPE'; const CDATA_OPEN = '[CDATA['; const CDATA_CLOSE = ']]>'; const COMMENT_OPEN = '--'; const COMMENT_CLOSE = '-->'; const TAG_OPEN = /[a-zA-Z]/; const TAG_CHAR = /[a-zA-Z0-9]/; const ATTRIBUTE_NAME = /[^\t\n\f />"'=]/; const WHITESPACE = /[\s\n\r]/; const CRAWLABLE_META_NAME_ATTRS = new Set([ 'og:url', 'og:image', 'og:image:url', 'og:image:secure_url', 'og:video', 'og:video:url', 'og:video:secure_url', 'og:audio', 'og:audio:url', 'og:audio:secure_url', 'twitter:image' ]); /** * @param {string} html * @param {string} base */ export function crawl(html, base) { /** @type {string[]} */ const ids = []; /** @type {string[]} */ const hrefs = []; let i = 0; main: while (i < html.length) { const char = html[i]; if (char === '<') { if (html[i + 1] === '!') { i += 2; if (html.slice(i, i + DOCTYPE.length).toUpperCase() === DOCTYPE) { i += DOCTYPE.length; while (i < html.length) { if (html[i++] === '>') { continue main; } } } // skip cdata if (html.slice(i, i + CDATA_OPEN.length) === CDATA_OPEN) { i += CDATA_OPEN.length; while (i < html.length) { if (html.slice(i, i + CDATA_CLOSE.length) === CDATA_CLOSE) { i += CDATA_CLOSE.length; continue main; } i += 1; } } // skip comments if (html.slice(i, i + COMMENT_OPEN.length) === COMMENT_OPEN) { i += COMMENT_OPEN.length; while (i < html.length) { if (html.slice(i, i + COMMENT_CLOSE.length) === COMMENT_CLOSE) { i += COMMENT_CLOSE.length; continue main; } i += 1; } } } // parse opening tags const start = ++i; if (TAG_OPEN.test(html[start])) { while (i < html.length) { if (!TAG_CHAR.test(html[i])) { break; } i += 1; } const tag = html.slice(start, i).toUpperCase(); /** @type {Record<string, string>} */ const attributes = {}; if (tag === 'SCRIPT' || tag === 'STYLE') { while (i < html.length) { if ( html[i] === '<' && html[i + 1] === '/' && html.slice(i + 2, i + 2 + tag.length).toUpperCase() === tag ) { continue main; } i += 1; } } while (i < html.length) { const start = i; const char = html[start]; if (char === '>') break; if (ATTRIBUTE_NAME.test(char)) { i += 1; while (i < html.length) { if (!ATTRIBUTE_NAME.test(html[i])) { break; } i += 1; } const name = html.slice(start, i).toLowerCase(); while (WHITESPACE.test(html[i])) i += 1; if (html[i] === '=') { i += 1; while (WHITESPACE.test(html[i])) i += 1; let value; if (html[i] === "'" || html[i] === '"') { const quote = html[i++]; const start = i; let escaped = false; while (i < html.length) { if (escaped) { escaped = false; } else { const char = html[i]; if (html[i] === quote) { break; } if (char === '\\') { escaped = true; } } i += 1; } value = html.slice(start, i); } else { const start = i; while (html[i] !== '>' && !WHITESPACE.test(html[i])) i += 1; value = html.slice(start, i); i -= 1; } value = decode(value); attributes[name] = value; } else { i -= 1; } } i += 1; } const { href, id, name, property, rel, src, srcset, content } = attributes; if (href) { if (tag === 'BASE') { base = resolve(base, href); } else if (!rel || !/\bexternal\b/i.test(rel)) { hrefs.push(resolve(base, href)); } } if (id) { ids.push(id); } if (name && tag === 'A') { ids.push(name); } if (src) { hrefs.push(resolve(base, src)); } if (srcset) { let value = srcset; const candidates = []; let insideURL = true; value = value.trim(); for (let i = 0; i < value.length; i++) { if (value[i] === ',' && (!insideURL || (insideURL && WHITESPACE.test(value[i + 1])))) { candidates.push(value.slice(0, i)); value = value.substring(i + 1).trim(); i = 0; insideURL = true; } else if (WHITESPACE.test(value[i])) { insideURL = false; } } candidates.push(value); for (const candidate of candidates) { const src = candidate.split(WHITESPACE)[0]; if (src) hrefs.push(resolve(base, src)); } } if (tag === 'META' && content) { const attr = name ?? property; if (attr && CRAWLABLE_META_NAME_ATTRS.has(attr)) { hrefs.push(resolve(base, content)); } } } } i += 1; } return { ids, hrefs }; }