spec-url
Version:
URL library that implements a reference resolution algorithm for WHATWG URLs
492 lines (374 loc) • 12.5 kB
JavaScript
import { pct } from './characters.js'
import tr46 from 'tr46'
// Debug-logging shorthand: `console.log` pre-bound to `console`.
// NOTE(review): not referenced anywhere in the visible part of this file.
const log = console.log.bind (console)
// Authority and Host Model
// ========================
// Host :=
// IPv6Address | DomainName | IPv4Address | String
// The various Host objects are implemented as wrappers around an internal
// representation value. This internal value is stored under the property
// with a Symbol-key `valueKey`, abbreviated to $ for convenience.
// One Symbol under two names: `$` is the short alias used for property
// access, `valueKey` is the descriptive name.
const $ = Symbol ('$')
const valueKey = $
// ### IPv6 Address
// Wrapper around an IPv6 address. The internal value, stored under `$`,
// is an array of eight numeric pieces.
class URLIPv6Address {

  // `input` is either a string, which is handed to `ipv6.parse`, or
  // another URLIPv6Address, whose pieces are copied.
  // Any other input throws an Error.
  constructor (input) {
    this [$] = [0,0,0,0, 0,0,0,0]
    if (typeof input === 'string') {
      this [$] = ipv6.parse (input)
      return
    }
    const isAddress =
      typeof input === 'object' && input && input instanceof URLIPv6Address
    if (!isAddress)
      throw new Error (`Invalid IPv6 address ${String(input)}`)
    this [$] = Array.from (input [$])
  }

  // Prints the address via `ipv6.print`, wrapped in square brackets.
  toString () {
    const printed = ipv6.print (this [$])
    return `[${printed}]`
  }

}
// ### URLDomainName
// Wrapper around a domain name; the internal value stored under `$`
// is the string that was passed to the constructor.
class URLDomainName {

  // ASSUMES string is a valid unicode domain string;
  // it is stored as-is.
  constructor (string) {
    // TODO process/ validate the input
    this [$] = string
  }

  // REVIEW should the API for this be mutable instead?
  // And I suppose we should store the repr (unicode/ascii) too

  // Converts the domain with `tr46.toASCII`.
  // Throws an Error if the conversion yields null or undefined.
  toASCII () {
    const converted = tr46.toASCII (this [$], toASCIIOptions)
    if (converted == null)
      throw new Error (`URLDomainName.toASCII: Failed to convert <${this[$]}> to ASCII`)
    return converted
  }

  // Returns the stored domain string unchanged.
  toString () {
    return this [$]
  }

}
// ### IPv4 Address
// Wrapper around an IPv4 address. The internal value, stored under `$`,
// is a single number in the range 0 .. 0xFFFFFFFF.
class URLIPv4Address {

  // `input` is one of:
  // - a number: must be an integer in the range 0 .. 0xFFFFFFFF,
  // - a string: parsed with `ipv4.parse`,
  // - another URLIPv4Address: its value is copied.
  // Throws an Error for any other input, and for strings for which
  // `ipv4.parse` returns null. Errors thrown by `ipv4.parse` itself
  // propagate unchanged.
  constructor (input) {
    this [$] = 0
    let value = null
    if (typeof input === 'number') {
      // Fractional numbers are rejected; they would otherwise be stored
      // as-is and silently truncated by the bit operations when printing.
      if (Number.isInteger (input) && 0 <= input && input <= 0xFF_FF_FF_FF)
        value = input
    }
    else if (typeof input === 'string') {
      value = ipv4.parse (input)
    }
    else if (typeof input === 'object' && input !== null && input instanceof URLIPv4Address) {
      value = input [$]
    }
    if (value === null)
      throw new Error (`Invalid IPv4 address: <${input}>`)
    this [$] = value
  }

  // Prints the address via `ipv4.print`.
  toString () {
    return ipv4.print (this [$])
  }

}
// Authority Parsing
// -----------------
// (c) The credentials sigil is the last "@" - if any.
// If it is present then the authority has a username.
// (w) The password sigil is the first ":" before (c).
// If it is present then the authority has a password.
// (p) The port sigil is the first ":" over-all, or
// after (c) if (c) is present. If (p) is present then
// the authority has a port.
// The algorithm makes a single pass from left to right over the input
// to find the positions of the sigils. It uses -1 to indicate absence
// for (c) and (w) and it uses input.length to indicate absence of (p).
// NB This does *not* parse the domain. The host property of
// the return value is either an ipv6 address or an opaque host string.
// Splits an authority string into its components, using a single
// left-to-right pass to locate the sigils.
//
// input   - the authority string, without the leading `//`.
// Returns - an object with `host` (a string, or a URLIPv6Address when the
//           host starts with `[`) and, when present, `user`, `pass` and
//           `port` (whatever parsePort returns).
// Throws  - an Error if the port string is rejected by parsePort, or if
//           authErrors reports a violated invariant. Errors raised while
//           constructing the URLIPv6Address propagate unchanged.
function parseAuth (input) {
  const auth = { }
  const len = input.length

  // c: last '@' (-1 if absent), w: first ':' (-1 if absent),
  // p: port sigil (len if absent)
  let c = -1, w = -1, p = len
  let bracks = false
  for (let i=0; i<len; i++) switch (input[i]) {
    case ']': bracks = false; break
    case '[': bracks = true; break
    case '@':
      // Everything before this point was credentials after all,
      // so forget any port sigil found so far.
      bracks = false
      c = i
      p = len
      break
    case ':':
      if (w < 0) w = i
      // Only the *first* unbracketed ':' after the last '@' is the port
      // sigil. A later ':' then ends up in the port string and is
      // rejected there, rather than being absorbed into the host.
      if (!bracks && p === len) p = i
      break
  }

  // At this point we have collected the sigil positions
  // and we can break the input string into separate components.
  const str = input.substring.bind (input)

  if (c >= 0) { // has credentials
    if (0 <= w && w < c) { // has password
      auth.user = str (0, w)
      auth.pass = str (w + 1, c)
    }
    else {
      auth.user = str (0, c)
    }
  }

  auth.host = input[c+1] === '['
    ? new URLIPv6Address (str (c+1, p))
    : str (c+1, p)

  if (p < len) {
    try {
      auth.port = parsePort (str (p + 1))
    }
    catch (e) {
      throw new Error (`Invalid port string in authority //${input}`, { cause: e })
    }
  }

  // Check structural invariants
  const errs = authErrors (auth)
  if (errs) {
    const message = '\n\t- ' + errs.join ('\n\t- ') + '\n'
    throw new Error (`Invalid authority //${input} ${message}`)
  }
  return auth
}
// ### Authority - Structural invariants
// Checks the structural invariants of an authority object.
// Returns an array with one message per violated invariant,
// or null if there are none.
function authErrors (auth) {
  const { host, port, user, pass } = auth
  const hostless = host == null || host === ''
  const hasCredentials = user != null || pass != null

  // Pairs of [violated?, message], in reporting order
  const checks = [
    [ hostless && port != null,
      `An authority with an empty hostname cannot have a port` ],
    [ hostless && hasCredentials,
      `An authority with an empty hostname cannot have credentials` ],
    [ pass != null && user == null,
      `An authority without a username cannot have a password` ],
  ]

  const found = checks
    .filter (([violated]) => violated)
    .map (([, message]) => message)
  return found.length > 0 ? found : null
}
// ### Port
// A port may either be the empty string,
// or the decimal representation of a number n < 2**16.
// Parses a port string.
//
// input   - the port string, without the leading ':'.
// Returns - the empty string if the input is empty, otherwise the port
//           as a number n with 0 <= n < 2**16.
// Throws  - an Error if the input is not a string of decimal digits,
//           or if its value is out of range.
function parsePort (input) {
  if (input === '') return input
  if (/^[0-9]+$/.test (input)) {
    const port = Number (input)
    // Compare the parsed number, not the raw string
    if (port < 2**16) return port
  }
  throw new Error (`Invalid port-string: "${input}"`)
}
// Host Parsing
// ------------
// Parses a host.
// - A non-empty string that starts with '[' becomes a URLIPv6Address.
// - Otherwise, if `parseDomain` is set and the input is non-empty, the
//   input is passed on to parseDomainOrIPv4Address.
// - Otherwise the input is returned as-is.
function parseHost (input, { parseDomain = false, percentCoded = true } = { }) {
  if (typeof input === 'string' && input.length && input[0] === '[')
    return new URLIPv6Address (input)
  if (parseDomain && input.length)
    return parseDomainOrIPv4Address (input, percentCoded)
  return input
}
// Prints a host. URLDomainName hosts are printed via their toASCII
// method when `unicode` is false; every other host is converted
// with String.
function printHost (h, { unicode = true } = { }) {
  const wantsASCII = !unicode && Boolean (h) && h instanceof URLDomainName
  if (wantsASCII) return h.toASCII ()
  return String (h)
}
// Tests whether a host is a URLDomainName whose value equals 'localhost',
// ignoring case. Always returns a boolean; in particular a null host
// yields false rather than null.
function isLocalHost (host) {
  // REVIEW should this also work with opaque hosts?
  if (typeof host !== 'object' || host === null) return false
  return host instanceof URLDomainName && _lc_equiv (host [$], 'localhost')
}
// Compares two strings for equality, ignoring the case of the
// ASCII letters A-Z only. All other code units must match exactly.
function _lc_equiv (s1, s2) {
  const len = s1.length
  if (len !== s2.length) return false
  // Setting bit 5 lowercases A-Z. It must not be applied to other code
  // units, or pairs such as '@' and '`' would compare equal.
  const fold = c => 0x41 <= c && c <= 0x5A ? c | 32 : c
  for (let i=0; i<len; i++) {
    if (fold (s1.charCodeAt (i)) !== fold (s2.charCodeAt (i))) return false
  }
  return true
}
// IPv6 Addresses
// --------------
// Regex literal
// whitespace insignificant for readability
// Template tag that builds a sticky (`y`) RegExp from a pattern in which
// all whitespace is insignificant.
const rx = (...args) => {
  const r = new RegExp (String.raw (...args) .replace (/\s/g, ''), 'y')
  r.captures = [] // NOTE(review): not read anywhere in the visible source
  return r
}

// Tokeniser states:
// { start, hex, decimal }
//
// match[1]: decimal number, match[2]: the dot following it,
// match[3]: hex number,     match[4]: the colon following it,
// match[5]: `::` (start state) or `:` (hex state).

const _start = rx
  ` ([0-9]+)([.])
  | ([0-9A-Fa-f]+)(:)?
  | (::)`

const _hex = rx
  ` ([0-9]+)([.])
  | ([0-9A-Fa-f]+)(:)?
  | (:)`

const _dec = rx
  `([0-9]+)([.])?`

const ipv6 = {

  // Parses an IPv6 address string into an array of eight 16-bit pieces.
  //
  // input   - the address, optionally wrapped in square brackets. It may
  //           use `::` compression once and may end in a dotted-decimal
  //           IPv4 address.
  // Returns - an array of eight numbers, each in the range 0 .. 0xFFFF.
  // Throws  - a SyntaxError if the input is not a valid IPv6 address.
  parse (input) {
    if (input[0] === '[') {
      if (input[input.length-1] !== ']')
        throw new SyntaxError (`Invalid use of IPv6 address delimiters: ${input}`)
      input = input.substring (1, input.length-1)
    }

    const invalid = () => new SyntaxError (`Invalid IPv6 address: ${input}`)
    const parts = []
    const ip4 = []
    let compress = null
    // True while the last token was a hex piece followed by a single ':',
    // i.e. while another piece (or a second ':') must still follow.
    let dangling = false
    let state = _start
    let p = state.lastIndex = 0
    let match

    while ((match = state.exec (input))) {
      p = state.lastIndex
      dangling = false

      if (match[1]) { // decimal number - ipv4 part
        const digits = match[1]
        const n = +digits
        // Each IPv4 part is a single byte, without leading zeroes
        if (n > 255 || (digits.length > 1 && digits[0] === '0')) throw invalid ()
        ip4.push (n)
        if (!match[2]) break // no dot separator `.` - last part
        if (ip4.length === 4) throw invalid () // dot after the fourth part
        state = _dec
      }

      else if (match[3]) { // hex number - ipv6 part
        // A piece is 16 bits: at most four hex digits
        if (match[3].length > 4) throw invalid ()
        parts.push (parseInt (match[3], 16))
        if (!match[4]) break // no separator `:` - last piece
        dangling = true
        state = _hex
      }

      else if (match[5]) { // ipv6 compress `::`, or second `:` of it
        if (compress != null) throw invalid ()
        compress = parts.length
        state = _hex
      }

      // The regexes are sticky; continue where this token ended
      state.lastIndex = p
    }

    if (p !== input.length || dangling || (ip4.length && ip4.length !== 4))
      throw invalid ()

    if (ip4.length) {
      const [n1, n2, n3, n4] = ip4
      parts.push (0x100 * n1 + n2, 0x100 * n3 + n4)
    }

    // Without compression there must be exactly eight pieces; with
    // compression the `::` must stand in for at least one piece.
    if (compress == null ? parts.length !== 8 : parts.length > 7)
      throw invalid ()

    if (compress != null) {
      const zeroes = new Array (8 - parts.length) .fill (0)
      parts.splice (compress, 0, ...zeroes)
    }
    return parts
  },

  // Prints an array of eight pieces as lowercase hex, separated by ':'.
  // The first longest run of two or more zero pieces is replaced by `::`.
  print (parts) {
    // Find the longest sequence of zeroes: start s0 and length l0
    let s0 = 0, l0 = 0
    for (let i=0; i<parts.length; i++) {
      if (parts[i] !== 0) continue
      const s1 = i
      while (i < parts.length && parts[i] === 0) i++
      if (i - s1 > l0) {
        s0 = s1
        l0 = i - s1
      }
    }

    const printed = parts.map (n => n.toString (16))
    if (l0 > 1) {
      // The placeholder is joined with ':' on each side where it has a
      // neighbour, so it must supply the colons that are missing.
      const c = s0 > 0 && s0 + l0 < 8 ? '' : s0 === 0 && l0 === 8 ? '::' : ':'
      printed.splice (s0, l0, c)
    }
    return printed.join (':')
  },

  // Parses and re-prints an address; throws whenever parse does.
  normalise (input) {
    return ipv6.print (ipv6.parse (input))
  }

}
// Domains
// -------
// Options passed to `tr46.toUnicode` by parseDomainOrIPv4Address.
// NOTE(review): the identifier is presumably a misspelling of
// `toUnicodeOptions`; renaming it requires updating its use as well.
const toUniodeOptions = {
checkBidi: true,
checkHyphens:false,
checkJoiners: true,
useSTD3ASCIIRules: false,
processingOption: "nontransitional",
}
// Options passed to `tr46.toASCII` by URLDomainName's toASCII method.
// These repeat the settings above and add `verifyDNSLength`.
const toASCIIOptions = {
checkBidi: true,
checkHyphens: false,
checkJoiners: true,
useSTD3ASCIIRules: false,
processingOption: 'nontransitional',
verifyDNSLength: false // REVIEW
}
// Matches strings that consist solely of characters
// that are allowed in a domain string.
const _isDomainString =
  /^[^\x00-\x20\x7F#%/:<>?@[\\\]^|]+$/

// Matches strings of which the last label, ignoring one trailing dot,
// is a decimal number or a `0x` prefixed hex number.
const _endsInNumber =
  /(^|[.])([0-9]+|0[xX][0-9A-Fa-f]*)[.]?$/

// Parses a hostname as an IPv4 address or else as a domain name.
//
// input        - the hostname string.
// percentCoded - if true, the input is first passed through `pct.decode`.
// Returns      - a URLIPv4Address or a URLDomainName.
// Throws       - an Error if `tr46.toUnicode` reports an error, if the
//                result is not an IPv4 address yet ends in a number, or
//                if it contains characters not allowed in a domain.
function parseDomainOrIPv4Address (input, percentCoded = true) {
  const decoded = percentCoded ? pct.decode (input) : input
  const { domain, error } = tr46.toUnicode (decoded, toUniodeOptions)
  if (error)
    throw new Error (`parseDomainOrIPv4Address: The hostname <${input}> is not a valid encoded domain-name`)

  // First try IPv4; a failure here just means "not an IPv4 address",
  // the checks below decide what to report.
  let address = null
  try { address = new URLIPv4Address (domain) }
  catch (e) { /* fall through to the domain checks */ }
  if (address !== null) return address

  if (_endsInNumber.test (domain))
    throw new Error (`parseDomainOrIPv4Address: The last domain-name--label in <${input}> must not be a numeric string`)

  if (!_isDomainString.test (domain))
    throw new Error (`parseDomain: The hostname <${input}> cannot be parsed as a domain-name, nor as an IPv4 address`)

  return new URLDomainName (domain)
}
// function domainToASCII (domain) {
// const domainString = domain [$]
// const ASCIIString = tr46.toASCII (domainString, toASCIIOptions)
// if (ASCIIString != null) return new URLDomainName (ASCIIString)
// else throw new Error (`domainToASCII: invalid domain ${domain}`)
// }
// An alternative option is to piggy-back on the URL constructor
// so as to avoid having to include the rather large tr46.
// The disadvantage of that is that then we lose the ability to print
// the full unicode version of the domain.
// Converts a URLDomainName to ASCII by assigning its value to the
// `hostname` of a URL object with host 'a' and reading it back.
// A hostname that still reads 'a' afterwards is treated as a rejected
// assignment, which is why the domain 'a' itself is special-cased.
// Returns a new URLDomainName; throws an Error for an invalid domain.
function domainToASCII_alt (domain) {
  const domainString = domain [$]
  if (domainString.length === 1 && domainString[0] === 'a')
    return new URLDomainName ('a')

  const probe = new URL ('http://a/')
  probe.hostname = domainString
  const { hostname } = probe
  if (hostname !== 'a')
    return new URLDomainName (hostname)
  throw new Error (`domainToASCII: invalid domain ${domainString}`)
}
// IPv4 Addresses
// --------------
// Tokeniser for a single IPv4 number with an optional trailing dot.
const _ip4num = rx
  `(?: 0[xX]([0-9A-Fa-f]*)
     | (0[0-7]*)
     | ([1-9][0-9]*)
   )([.])?`

// match[1]: hex
// match[2]: octal
// match[3]: decimal
// match[4]: trailing dot

// This can also be done with a cute state machine.
// States: (s)tart (z)ero (o)ctal (d)ecimal (x)hex
// and epsilon/accepting states (Z,O,D,X) to accept
// them with trailing dot. This does accept 0x and 0x.
// which indeed the URL IPv4 Parser does too.
//
// s z o d x
// --========+
// _ x _ _ _ | xX
// z o o d x | 0
// d o o d x | 1-7
// d _ _ d x | 8-9
// _ _ _ _ x | A-Fa-f
// _ Z O D X | .

const ipv4 = {

  // Parses a string of one to four dot-separated numbers (hex, octal or
  // decimal), optionally followed by one trailing dot.
  //
  // Returns - the address as an unsigned 32-bit number, or null if the
  //           input does not have the shape of an IPv4 address.
  // Throws  - a RangeError if the input has that shape, but a number
  //           before the last exceeds 255 or the last number does not
  //           fit in the remaining bytes.
  parse (input) {
    _ip4num.lastIndex = 0
    let addr = 0, count = 0
    let match, err = false
    while ((match = _ip4num.exec (input))) {
      count++
      const num
        = match[1] != null ? parseInt (match[1]||'0', 16) // hex
        : match[2] != null ? parseInt (match[2], 8) // octal
        : parseInt (match[3], 10) // decimal

      if (_ip4num.lastIndex === input.length) {
        // Last number: it fills all of the remaining bytes
        const rest = 5 - count
        if (err || (num >= 256**rest))
          throw new RangeError (`Invalid IPv4 address: <${input}>`)
        return ((addr << 8 * rest) + num) >>> 0
      }
      else {
        if (count === 4 || !match[4]) return null
        // Range errors are only reported once the whole input
        // has been found to have the shape of an IPv4 address.
        err = err || (num > 255)
        addr = (addr << 8) + num
      }
    }
    return null
  },

  // Prints a 32-bit number as four dot-separated decimal bytes.
  print (num) {
    let r = ''
    for (let i=3; i; i--) r += ((num >> 8*i) & 255) + '.'
    return r + (num & 255)
  },

  // Parses and re-prints an address. Returns null when parse does,
  // rather than printing that null as the address 0.0.0.0.
  normalise (input) {
    const num = ipv4.parse (input)
    return num === null ? null : ipv4.print (num)
  }

}
// Exports
// =======
export {
parseAuth, parsePort,
URLIPv6Address, URLDomainName, URLIPv4Address,
parseDomainOrIPv4Address,
parseHost, printHost, isLocalHost,
ipv6, ipv4,
}