tldts
Version:
Library to work against complex domain names, subdomains and URIs.
208 lines (195 loc) • 6.47 kB
text/typescript
// NOTE: kept (intentionally) near-identical to packages/tldts-icann/src/suffix-trie.ts.
// They are separate copies rather than a shared helper because the lookup is
// only fast when the typed arrays are module-scope monomorphic globals —
// closing over them (a shared factory) measured ~20% slower. The ICANN build
// also specializes (constant mask, no isIcann/isPrivate). Keep the two in sync.
import {
fastPathLookup,
IPublicSuffix,
ISuffixLookupOptions,
} from 'tldts-core';
import {
edgeChild,
edgeLength,
edgeStart,
exceptionsRoot,
labelText,
nodeFlags,
rulesRoot,
} from './data/trie';
const enum RULE_TYPE {
ICANN = 1,
PRIVATE = 2,
}
// `edgeOffset` (where each label starts in `labelText`), `edgeHash` (djb2 of
// each label) and `wildcardEdge` (each node's '*' edge, or -1) are derived once
// at load instead of being shipped: the bundle then carries only the
// compressible `labelText` + structure, while the lookup binary-searches
// integer hashes. The cost is a single ~1ms pass at first import — cheaper than
// the object trie it replaces. Kept at module scope (not captured in a closure)
// so V8 treats the typed arrays as fast monomorphic globals.
const numberOfNodes = nodeFlags.length;
const numberOfEdges = edgeLength.length;
const edgeOffset = new Uint32Array(numberOfEdges);
const edgeHash = new Uint32Array(numberOfEdges);
const wildcardEdge = new Int32Array(numberOfNodes).fill(-1);
for (let node = 0, offset = 0; node < numberOfNodes; node += 1) {
for (let edge = edgeStart[node]!; edge < edgeStart[node + 1]!; edge += 1) {
edgeOffset[edge] = offset;
const end = offset + edgeLength[edge]!;
let hash = 5381;
for (let i = end - 1; i >= offset; i -= 1) {
hash = (hash * 33) ^ labelText.charCodeAt(i);
}
edgeHash[edge] = hash >>> 0;
if (
edgeLength[edge] === 1 &&
labelText.charCodeAt(offset) === 42 /* '*' */
) {
wildcardEdge[node] = edge;
}
offset = end;
}
}
// Result of the last `walk`, kept in module scope to avoid allocating a match
// object. Safe because lookups are synchronous and read right after `walk`.
let matchNode = -1;
let matchStart = 0;
let matchEnd = 0;
/**
* True if edge `edge`'s label equals `hostname[start, start + length)`.
*/
function labelEquals(
edge: number,
hostname: string,
start: number,
length: number,
): boolean {
if (edgeLength[edge] !== length) {
return false;
}
const offset = edgeOffset[edge]!;
for (let i = 0; i < length; i += 1) {
if (labelText.charCodeAt(offset + i) !== hostname.charCodeAt(start + i)) {
return false;
}
}
return true;
}
/**
* Find the child edge of `node` whose label is `hostname[start, start + length)`.
* Edges are sorted by hash, so binary-search the hash then verify the label
* (scanning the rare run of equal hashes). Returns the edge index or -1.
*/
function findEdge(
node: number,
hash: number,
hostname: string,
start: number,
length: number,
): number {
let lo = edgeStart[node]!;
let hi = edgeStart[node + 1]!;
while (lo < hi) {
const mid = (lo + hi) >>> 1;
const value = edgeHash[mid]!;
if (value < hash) {
lo = mid + 1;
} else if (value > hash) {
hi = mid;
} else {
for (let e = mid; e >= lo && edgeHash[e] === hash; e -= 1) {
if (labelEquals(e, hostname, start, length)) return e;
}
for (let e = mid + 1; e < hi && edgeHash[e] === hash; e += 1) {
if (labelEquals(e, hostname, start, length)) return e;
}
return -1;
}
}
return -1;
}
/**
* Walk `hostname`'s labels right-to-left from `root`, recording the deepest
* node whose flag passes `allowedMask` (with the label boundaries of that match
* in `matchStart`/`matchEnd`). Returns whether any match was found.
*/
function walk(hostname: string, root: number, allowedMask: number): boolean {
let node = root;
let end = hostname.length;
let hash = 5381;
matchNode = -1;
for (let i = hostname.length - 1; i >= 0; i -= 1) {
const code = hostname.charCodeAt(i);
if (code === 46 /* '.' */) {
const start = i + 1;
let edge = findEdge(node, hash >>> 0, hostname, start, end - start);
if (edge === -1) {
edge = wildcardEdge[node]!;
}
if (edge === -1) {
return matchNode !== -1;
}
node = edgeChild[edge]!;
if ((nodeFlags[node]! & allowedMask) !== 0) {
matchNode = node;
matchStart = start;
matchEnd = end;
}
end = i;
hash = 5381;
} else {
hash = (hash * 33) ^ code;
}
}
// Left-most label: hostname[0, end). Same find/descend/record as the loop —
// duplicated rather than folded into the loop (via `i >= -1`) because that
// extra per-character branch measured slightly slower on the hot path.
let edge = findEdge(node, hash >>> 0, hostname, 0, end);
if (edge === -1) {
edge = wildcardEdge[node]!;
}
if (edge !== -1) {
node = edgeChild[edge]!;
if ((nodeFlags[node]! & allowedMask) !== 0) {
matchNode = node;
matchStart = 0;
matchEnd = end;
}
}
return matchNode !== -1;
}
/**
* Check if `hostname` has a valid public suffix in the trie.
*/
export default function suffixLookup(
hostname: string,
options: ISuffixLookupOptions,
out: IPublicSuffix,
): void {
if (fastPathLookup(hostname, options, out)) {
return;
}
const allowedMask =
(options.allowPrivateDomains ? RULE_TYPE.PRIVATE : 0) |
(options.allowIcannDomains ? RULE_TYPE.ICANN : 0);
// Exceptions have priority and strip their own left-most label (e.g. the
// rule '!www.ck' makes the suffix of 'www.ck' be 'ck').
if (walk(hostname, exceptionsRoot, allowedMask)) {
out.isIcann = (nodeFlags[matchNode]! & RULE_TYPE.ICANN) !== 0;
out.isPrivate = (nodeFlags[matchNode]! & RULE_TYPE.PRIVATE) !== 0;
out.publicSuffix = hostname.slice(matchEnd + 1);
return;
}
if (walk(hostname, rulesRoot, allowedMask)) {
out.isIcann = (nodeFlags[matchNode]! & RULE_TYPE.ICANN) !== 0;
out.isPrivate = (nodeFlags[matchNode]! & RULE_TYPE.PRIVATE) !== 0;
out.publicSuffix = hostname.slice(matchStart);
return;
}
// No match: the prevailing '*' rule makes the right-most label the suffix.
out.isIcann = false;
out.isPrivate = false;
const lastDot = hostname.lastIndexOf('.');
out.publicSuffix = lastDot === -1 ? hostname : hostname.slice(lastDot + 1);
}