tldts-experimental
Version:
Library to work against complex domain names, subdomains and URIs.
279 lines (246 loc) • 8.54 kB
text/typescript
import {
fastPathLookup,
IPublicSuffix,
ISuffixLookupOptions,
} from 'tldts-core';
import packed from './data/hashes';
/**
* Find `elt` in `arr` between indices `start` (included) and `end` (excluded)
* using a binary search algorithm.
*/
function binSearch(
arr: Uint32Array,
elt: number,
start: number,
end: number,
): boolean {
if (start >= end) {
return false;
}
let low = start;
let high = end - 1;
while (low <= high) {
const mid = (low + high) >>> 1;
const midVal = arr[mid]!;
if (midVal < elt) {
low = mid + 1;
} else if (midVal > elt) {
high = mid - 1;
} else {
return true;
}
}
return false;
}
// Packed hash algorithm makes use of a rolling hash to lookup suffixes. To
// avoid having to allocate an array to store them at every invocation, we
// create one global one that can be reused.
const BUFFER = new Uint32Array(20);
/**
* Iterate on hashes of labels from `hostname` backward (from last label to
* first label), stopping after `maximumNumberOfLabels` have been extracted and
* calling `cb` on each of them.
*
* The `maximumNumberOfLabels` argument is typically used to specify the number
* of labels seen in the longest public suffix. We do not need to check further
* in very long hostnames.
*/
function hashHostnameLabelsBackward(
hostname: string,
maximumNumberOfLabels: number,
): number {
let hash = 5381;
let index = 0;
// Compute hash backward, label per label
for (let i = hostname.length - 1; i >= 0; i -= 1) {
const code = hostname.charCodeAt(i);
// Process label
if (code === 46 /* '.' */) {
BUFFER[index << 1] = hash >>> 0;
BUFFER[(index << 1) + 1] = i + 1;
index += 1;
if (index === maximumNumberOfLabels) {
return index;
}
}
// Update hash
hash = (hash * 33) ^ code;
}
// Let's not forget about last label
BUFFER[index << 1] = hash >>> 0;
BUFFER[(index << 1) + 1] = 0;
index += 1;
return index;
}
const enum Result {
NO_MATCH = 0,
ICANN_MATCH = 1,
PRIVATE_MATCH = 2,
EXCEPTION_MATCH = 4,
NORMAL_MATCH = 8,
WILDCARD_MATCH = 16,
}
/**
* Perform a public suffix lookup for `hostname` using the packed hashes
* data-structure. The `options` allows to specify if ICANN/PRIVATE sections
* should be considered. By default, both are.
*
*/
export default function suffixLookup(
hostname: string,
options: ISuffixLookupOptions,
out: IPublicSuffix,
): void {
if (fastPathLookup(hostname, options, out)) {
return;
}
const { allowIcannDomains, allowPrivateDomains } = options;
// Keep track of longest match
let matchIndex = -1;
let matchKind = Result.NO_MATCH;
let matchLabels = 0; // Keep track of number of labels currently matched
// Index in the packed array data-structure
let index = 1;
const numberOfHashes = hashHostnameLabelsBackward(
hostname,
packed[0]! /* maximumNumberOfLabels */,
);
for (let label = 0; label < numberOfHashes; label += 1) {
const hash = BUFFER[label << 1]!;
const labelStart = BUFFER[(label << 1) + 1]!;
// For each label, matching proceeds in the following way:
//
// 1. check exceptions
// 2. check wildcards
// 3. check normal rules
//
// For each of these, we also perform the lookup in two parts, once for
// the ICANN section and one for the PRIVATE section. Both of which are
// optional and can be enabled/disabled using the `options` argument.
//
// We start with exceptions because if an exception is found, we do not
// need to continue matching wildcards or normal rules; the exception will
// always have priority.
//
// Similarly, if we find a wildcard match, we do not need to check the
// rules for the same label as the wildcard match is always longer (one
// more label is matched).
//
// **WARNING**: the structure of this code follows exactly the structure
// of the packed data structure as create in ./bin/builders/hashes.js
let match = Result.NO_MATCH;
// ========================================================================
// Lookup exceptions
// ========================================================================
// ICANN
if (allowIcannDomains) {
match = binSearch(packed, hash, index + 1, index + packed[index]! + 1)
? Result.ICANN_MATCH | Result.EXCEPTION_MATCH
: Result.NO_MATCH;
}
index += packed[index]! + 1;
// PRIVATE
if (allowPrivateDomains && match === Result.NO_MATCH) {
match = binSearch(packed, hash, index + 1, index + packed[index]! + 1)
? Result.PRIVATE_MATCH | Result.EXCEPTION_MATCH
: Result.NO_MATCH;
}
index += packed[index]! + 1;
// ========================================================================
// Lookup wildcards
// ========================================================================
// ICANN
if (
allowIcannDomains &&
match === Result.NO_MATCH &&
(matchKind & Result.EXCEPTION_MATCH) === 0
) {
match = binSearch(packed, hash, index + 1, index + packed[index]! + 1)
? Result.WILDCARD_MATCH | Result.ICANN_MATCH
: Result.NO_MATCH;
}
index += packed[index]! + 1;
// PRIVATE
if (
allowPrivateDomains &&
match === Result.NO_MATCH &&
(matchKind & Result.EXCEPTION_MATCH) === 0
) {
match = binSearch(packed, hash, index + 1, index + packed[index]! + 1)
? Result.WILDCARD_MATCH | Result.PRIVATE_MATCH
: Result.NO_MATCH;
}
index += packed[index]! + 1;
// ========================================================================
// Lookup rules
// ========================================================================
// ICANN
if (
allowIcannDomains &&
match === Result.NO_MATCH &&
(matchKind & Result.EXCEPTION_MATCH) === 0 &&
matchLabels <= label
) {
match = binSearch(packed, hash, index + 1, index + packed[index]! + 1)
? Result.NORMAL_MATCH | Result.ICANN_MATCH
: Result.NO_MATCH;
}
index += packed[index]! + 1;
// PRIVATE
if (
allowPrivateDomains &&
match === Result.NO_MATCH &&
(matchKind & Result.EXCEPTION_MATCH) === 0 &&
matchLabels <= label
) {
match = binSearch(packed, hash, index + 1, index + packed[index]! + 1)
? Result.NORMAL_MATCH | Result.PRIVATE_MATCH
: Result.NO_MATCH;
}
index += packed[index]! + 1;
// If we found a match, the longest match that is being tracked for this
// hostname. We need to remember which kind of match it was (exception,
// wildcard, normal rule), the index where the suffix starts in `hostname`
// as well as the number of labels contained in this suffix (this is
// important to make sure that we always keep the longest match if there
// are both a wildcard and a normal rule matching).
if (match !== Result.NO_MATCH) {
matchKind = match;
matchLabels = label + ((match & Result.WILDCARD_MATCH) !== 0 ? 2 : 1);
matchIndex = labelStart;
}
}
out.isIcann = (matchKind & Result.ICANN_MATCH) !== 0;
out.isPrivate = (matchKind & Result.PRIVATE_MATCH) !== 0;
// No match found
if (matchIndex === -1) {
out.publicSuffix =
numberOfHashes === 1 ? hostname : hostname.slice(BUFFER[1]);
return;
}
// If match is an exception, this means that we need to count less label.
// For example, exception rule !foo.com would yield suffix 'com', so we need
// to locate the next dot and slice from there.
if ((matchKind & Result.EXCEPTION_MATCH) !== 0) {
out.publicSuffix = hostname.slice(BUFFER[((matchLabels - 2) << 1) + 1]);
return;
}
// If match is a wildcard, we need to match one more label. If wildcard rule
// was *.com, we would have stored only 'com' in the packed structure and we
// need to take one extra label on the left.
if ((matchKind & Result.WILDCARD_MATCH) !== 0) {
if (matchLabels < numberOfHashes) {
out.publicSuffix = hostname.slice(BUFFER[((matchLabels - 1) << 1) + 1]);
return;
}
const parts = hostname.split('.');
while (parts.length > matchLabels) {
parts.shift();
}
out.publicSuffix = parts.join('.');
return;
}
// if ((matchKind & Result.NORMAL_MATCH) !== 0)
// For normal match, we just slice the hostname at the beginning of suffix.
out.publicSuffix = hostname.slice(matchIndex);
}