UNPKG

spec-url

Version:

URL library that implements a reference resolution algorithm for WHATWG URLs

github.com/alwinb/spec-url

alwinb/spec-url

268 lines (215 loc) • 7.26 kB

JavaScript

const log = console.log.bind (console)
const { assign } = Object
const intsInto = (map, i = 0) => new Proxy ({}, { get:($,k) => (map [k] = i, i++) })
function* range (a, z = Infinity) { while (a <= z) yield a++ }

// Component Characters
// ====================

// Components are validated and normalised according to their component type.
// Depending on the component type, code-points may be:

// V = 0 0 0 0 0  // Valid, ASCII
// E = 0 0 0 0 1  // Valid, escaped
// U = 0 0 0 1 0  // Valid, unicode
// T = 0 0 1 0 0  // Invalid
// F = 0 0 1 1 1  // Invalid, escaped
// R = 0 1 1 1 1  // Invalid, escaped and/or rejected
// I = 1 0 1 0 1  // Invalid, escaped and/or skipped
// --|----------
//   | s r w u e  // skip, reject, warn, valid-unicode, escape

// (The bit patterns used in the categorisation are (so far)
// merely an implementation detail used in the encoding algorithm)

const V = 0b00000
const E = 0b00001
const U = 0b00010
const T = 0b00100
const F = 0b00111
const R = 0b01111
const I = 0b10101

// Rather than specifying this per individual codepoint, we partition
// the set of all codepoints into non-overlapping equivalence classes
// that are subdivisions of gen-delims -- excluding [ and ],
// sub-delims, unreserved ASCII, controls, other unicode, and the
// percent character.

// The following table maps ascii + c1-control code points to their
// character class id; Higher code points are handled in code.
// Each row covers sixteen code points; the letter 'a' + n denotes class n.

const _eqs = [
  'moooooooonnoonoo', // 0x00 - 0x0F
  'oooooooooooooooo', // 0x10 - 0x1F
  'pjtfjgjhjjjjjkkc', // 0x20 - 0x2F
  'kkkkkkkkkkairire', // 0x30 - 0x3F
  'bkkkkkkkkkkkkkkk', // 0x40 - 0x4F
  'kkkkkkkkkkkqdqvk', // 0x50 - 0x5F
  'ukkkkkkkkkkkkkkk', // 0x60 - 0x6F
  'kkkkkkkkkkksqsko', // 0x70 - 0x7F
  'oooooooooooooooo', // 0x80 - 0x8F
  'oooooooooooooooo', // 0x90 - 0x9F
].join ('')

const cctable = _eqs.split ('')
  .map (n => n.charCodeAt (0) - 'a'.charCodeAt (0))

// Naming a mere few of all of the classes:

const cc_pct = 6 // g
const cc_unreserved = 10 // k
const cc_other_unicode = 11 // l
const cc_control = 14 // o

// We specify per component type, per class, how the
// code point should be validated and/or normalised.

// Action table

const _ = V
const whatwg = new Uint8Array ([
//0, 1, 2, 3, 4, 5, 6
//------------ gen-delims ----
  F, F, _, _, _, _, _, // a) :
  F, R, _, _, _, _, _, // b) @
  F, F, F, T, _, _, _, // c) /
  F, R, T, T, T, T, T, // d) backslash
  F, F, F, F, _, _, _, // e) ?
  F, F, F, F, F, F, T, // f) #
//-------------- percent ---
  E, E, E, E, E, E, E, // g) % // Unless incremental=true
//------------ sub-delims --
  _, _, _, _, _, E, _, // h) '
  E, _, _, _, _, _, _, // i) ; =
  _, _, _, _, _, _, _, // j) ! $ & ( ) * + ,
//------------ unreserved ----
  _, _, _, _, _, _, _, // k) alpha digit - . _ ~
  U, U, U, U, U, U, U, // l) other unicode
//--------- non-component ----
  F, R, F, F, F, F, F, // m) nul
  I, I, I, I, I, I, I, // n) HT, LF, CR
  F, F, F, F, F, F, F, // o) control
  F, R, F, T, F, F, F, // p) SP
  F, R, T, T, T, T, T, // q) [ ] |
  F, R, F, T, F, F, F, // r) < >
  F, T, F, T, T, T, T, // s) { }
  F, T, F, T, F, F, F, // t) "
  F, T, F, T, T, T, F, // u) `
  F, R, F, T, T, T, T, // v) ^
// ---------------------------
//0, 1, 2, 3, 4, 5, 6
])

// The rows correspond to character classes.
// The columns correspond to encode/ action sets:

const encodeSets = {
  userInfo: 0, user:0, pass:0,
  opaqueHost: 1, host: 1,
  pathSegment: 2, dir:2, file:2,
  opaquePath: 3,
  query: 4,
  specialQuery: 5,
  fragment: 6, hash:6,
}

const descriptiveNames = [
  'userinfo',
  'opaque hostname',
  'path segment',
  'opaque path',
  'query',
  'special query',
  'fragment',
]

// Number of columns in the action table (one per encode set).
const _columns = descriptiveNames.length

// Percent Coding
// --------------

// Settings

const _FailStrict = 0b11100
const _FailSome   = 0b01000
const _FailNone   = 0
const _shouldSkip = 0b10000

// Upper bound on the number of arguments handed to String.fromCodePoint
// in a single call; see _fromCodePoints.
const _chunkSize = 0x2000

/**
 * Builds a string from an array of code points.
 *
 * Spreading an arbitrarily long array into a single String.fromCodePoint
 * call may exceed the engine's limit on the number of call arguments and
 * throw a RangeError, so the array is converted in bounded slices.
 *
 * @param {number[]} codes - code points
 * @returns {string}
 */
function _fromCodePoints (codes) {
  let result = ''
  for (let i = 0, l = codes.length; i < l; i += _chunkSize)
    result += String.fromCodePoint (...codes.slice (i, i + _chunkSize))
  return result
}

/**
 * Validates and percent-encodes `value` according to one of the action
 * sets (columns) of the `whatwg` table.
 *
 * Each code point is mapped to a character class, and the class together
 * with `encodeSet` selects an action: keep as is, escape as %XX (upper
 * case hex of the UTF-8 bytes), skip, and/or reject.
 *
 * @param {string} value - the string to encode; iterated by code point,
 *   unmatched surrogates are treated as U+FFFD.
 * @param {number} encodeSet - a column id, i.e. one of the values of
 *   `encodeSets` (an integer from 0 up to and including 6).
 * @param {object} [_options]
 * @param {boolean} [_options.incremental=true] - when true, '%' is
 *   handled as an unreserved character (left as is); when false it is
 *   escaped as %25.
 * @param {boolean} [_options.unicode=true] - when false, code points in
 *   the 'valid unicode' category are escaped as well.
 *   The `strict` and `fixup` options are accepted but currently have no
 *   effect: the strictness is fixed to reject-only.
 * @returns {string} the encoded string
 * @throws {Error} if `encodeSet` is not a valid column id, or if `value`
 *   contains a code point that is rejected for the given `encodeSet`.
 */
function percentEncode (value, encodeSet, _options = {}) {
  // A non-integer or out-of-range id would silently index a cell that
  // belongs to another row of the action table, so refuse it up front.
  if (typeof encodeSet !== 'number' || !Number.isInteger (encodeSet) || encodeSet < 0 || encodeSet >= _columns)
    throw new Error ('percentEncode: Invalid encodeSet Id')

  const { incremental = true, unicode = true } = _options
  const escapeSetting = unicode ? 1 : 0b11
  const strictnessSetting = _FailSome

  const out = []
  let flags = 0

  for (const c of value) {
    let cp = c.codePointAt (0)

    // JS specific - replace unmatched surrogates with u+FFFD
    if (cp >> 11 === 0x1B) cp = 0xFFFD

    // Determine the character class
    let cc
    if (cp === 0x25)
      cc = incremental ? cc_unreserved : cc_pct
    else if (cp < 0xA0)
      cc = cctable[cp] // Lookup table for ASCII-C1
    else if (0xFDD0 <= cp && cp <= 0xFDEF // non-char
      || ((cp >> 1) & 0x7FFF) === 0x7FFF) // non-char (xFFFE, xFFFF in any plane)
      cc = cc_control
    else
      cc = cc_other_unicode

    // look up the action/flags in the actions table
    const action = whatwg[cc * _columns + encodeSet]
    flags |= action

    if (action & _shouldSkip)
      continue

    if (action & escapeSetting) {
      for (const byte of utf8.encode (cp)) {
        const hi = byte >> 4
        const lo = byte & 0b1111
        // 48 is '0' and 55 + 10 is 'A': a single upper case hex digit
        out.push (0x25, (hi < 10 ? 48 : 55) + hi, (lo < 10 ? 48 : 55) + lo) // %xx code
      }
    }
    else out.push (cp)
  }

  if (flags & strictnessSetting) {
    throw new Error (`Rejected codepoints in ${descriptiveNames[encodeSet]}`)
  }

  return _fromCodePoints (out)
}

// Percent Coding API
// ------------------

const S = encodeSets

const pct = {
  encode: percentEncode,

  /**
   * Replaces each run of %XX escapes in `input` with the string that the
   * run's bytes decode to as UTF-8. Other text is left untouched.
   * Throws an Error if a run is not valid UTF-8 (see utf8.decode).
   */
  decode (input) {
    return input.replace (_pcts, _decode)
  }
}

// UTF8 Coding
// -----------

// Lead/continuation byte headers and the matching payload masks.
const [h2, h3, h4, h5] = [ 0b10<<6, 0b110<<5, 0b1110<<4, 0b11110<<3 ]
const [t6, t5, t4, t3] = [ ~(-1<<6), ~(-1<<5), ~(-1<<4), ~(-1<<3) ]

const utf8 = {

  /**
   * Encodes a single code point as an array of one to four UTF-8 bytes.
   * @param {number} code
   * @returns {number[]}
   */
  encode (code) {
    if (code < 0x80) return [code]
    else if (code < 0x800) {
      const [c1, c2] = [code >> 6, code & t6]
      return [h3|(t5 & c1), h2|(t6 & c2)]
    }
    else if (code < 0x10000) {
      const [c1, c2, c3] = [code >> 12, code >> 6, code & t6]
      return [h4|(t4 & c1), h2|(t6 & c2), h2|(t6 & c3)]
    }
    else {
      const [c1, c2, c3, c4] = [code >> 18, code >> 12, code >> 6, code & t6]
      return [h5|(t3 & c1), h2|(t6 & c2), h2|(t6 & c3), h2|(t6 & c4)]
    }
  },

  /**
   * Decodes an array(-like) of UTF-8 bytes into an array of code points.
   *
   * Rejects stray continuation bytes, lead bytes inside an unfinished
   * sequence, bytes >= 0xF8, overlong encodings and code points above
   * 0x10FFFF. Three-byte encodings of surrogate code points are still
   * accepted.
   *
   * @param {ArrayLike<number>} bytes
   * @returns {number[]}
   * @throws {Error} on invalid or incomplete input
   */
  decode (bytes) {
    const codes = []
    let pending = 0 // continuation bytes still expected
    let code = 0    // code point accumulated so far
    let min = 0     // smallest code point that needs the current length

    for (let i = 0, l = bytes.length; i < l; i++) {
      const b = bytes[i]

      if (b >= 0x80 && b < 0xc0) { // continuation byte
        if (pending === 0)
          throw new Error (`Invalid UTF8, at index ${i}`)
        code = code << 6 | b & 63
        pending--
        if (pending === 0) {
          // Overlong encodings and values beyond the unicode range are
          // not valid UTF-8; accepting them would allow e.g. %C0%AF to
          // be smuggled in as '/'.
          if (code < min || code > 0x10FFFF)
            throw new Error (`Invalid UTF8, at index ${i}`)
          codes.push (code)
        }
        continue
      }

      // Anything else must start a new sequence
      if (pending !== 0 || b >= 0xf8)
        throw new Error (`Invalid UTF8, at index ${i}`)

      if (b >= 0xf0) { pending = 3; code = b & 7; min = 0x10000 }
      else if (b >= 0xe0) { pending = 2; code = b & 15; min = 0x800 }
      else if (b >= 0xc0) { pending = 1; code = b & 31; min = 0x80 }
      else codes.push (b)
    }

    if (pending)
      throw new Error (`Incomplete UTF8 byte sequence`)
    return codes
  }

}

// private

// One or more consecutive %XX escapes
const _pcts = /(%[0-9A-Fa-f]{2})+/g

// Converts a run of %XX escapes (as matched by _pcts) to the string
// that its bytes encode in UTF-8.
const _decode = input => {
  const bytes = []
  for (let i = 0, l = input.length; i < l; i += 3)
    bytes.push (Number.parseInt (input.slice (i + 1, i + 3), 16))
  return _fromCodePoints (utf8.decode (bytes))
}

// Exports
// -------

const _private = { whatwg, encodeSets, descriptiveNames }
export { pct, utf8, encodeSets as _encodeSets, _private }