typesxml
Version:
Open source XML library written in TypeScript
560 lines • 27.2 kB
JavaScript
/*******************************************************************************
* Copyright (c) 2023-2026 Maxprograms.
*
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 1.0
* which accompanies this distribution, and is available at
* https://www.eclipse.org/org/documents/epl-v10.html
*
* Contributors:
* Maxprograms - initial API and implementation
*******************************************************************************/
/**
 * One member of a bracket expression while it is being translated.
 *
 * `content` is text that is valid *inside* a JS character class (a literal,
 * a range, an escape, a `\p{..}` property ...). `isComplement` marks members
 * that stand for the complement of `content` (for example `\D` or `\P{L}`),
 * which cannot be written directly inside a JS class and therefore need
 * special handling when the class is emitted.
 */
class CharClassItem {
    /** True when the member represents "everything except `content`". */
    isComplement;
    /** Class-body text for this member. */
    content;

    /**
     * @param {boolean} isComplement whether the member is a complemented set
     * @param {string} content class-body text of the (un-complemented) set
     */
    constructor(isComplement, content) {
        Object.assign(this, { isComplement, content });
    }
}
/**
 * Translates XML Schema (XSD) regular expressions into JavaScript regular
 * expression source that is meant to be compiled with the `u` flag.
 *
 * Entry points are {@link XsdRegexTranslator.toRegExp} (returns an anchored
 * `RegExp`) and {@link XsdRegexTranslator.translate} (returns the un-anchored
 * JS source). All other static members are parsing helpers and lookup tables.
 *
 * Every translation error is reported by throwing an `Error` whose message
 * starts with `XsdRegexTranslator:`.
 */
export class XsdRegexTranslator {
    // \i — NameStartChar (XML 1.0 Second Edition, Appendix B, productions [84][85][86])
    static NAME_START_CHAR = ':A-Z_a-z' +
        '\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF' +
        '\u0100-\u0131\u0134-\u013E\u0141-\u0148\u014A-\u017E' +
        '\u0180-\u01C3\u01CD-\u01F0\u01F4-\u01F5\u01FA-\u0217' +
        '\u0250-\u02A8\u02BB-\u02C1' +
        '\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE' +
        '\u03D0-\u03D6\u03DA\u03DC\u03DE\u03E0\u03E2-\u03F3' +
        '\u0401-\u040C\u040E-\u044F\u0451-\u045C\u045E-\u0481' +
        '\u0490-\u04C4\u04C7-\u04C8\u04CB-\u04CC\u04D0-\u04EB' +
        '\u04EE-\u04F5\u04F8-\u04F9' +
        '\u0531-\u0556\u0559\u0561-\u0586' +
        '\u05D0-\u05EA\u05F0-\u05F2' +
        '\u0621-\u063A\u0641-\u064A' +
        '\u0671-\u06B7\u06BA-\u06BE\u06C0-\u06CE\u06D0-\u06D3\u06D5\u06E5-\u06E6' +
        '\u0905-\u0939\u093D\u0958-\u0961' +
        '\u0985-\u098C\u098F-\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9' +
        '\u09DC-\u09DD\u09DF-\u09E1\u09F0-\u09F1' +
        '\u0A05-\u0A0A\u0A0F-\u0A10\u0A13-\u0A28\u0A2A-\u0A30' +
        '\u0A32-\u0A33\u0A35-\u0A36\u0A38-\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74' +
        '\u0A85-\u0A8B\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0' +
        '\u0AB2-\u0AB3\u0AB5-\u0AB9\u0ABD\u0AE0' +
        '\u0B05-\u0B0C\u0B0F-\u0B10\u0B13-\u0B28\u0B2A-\u0B30' +
        '\u0B32-\u0B33\u0B36-\u0B39\u0B3D\u0B5C-\u0B5D\u0B5F-\u0B61' +
        '\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99-\u0B9A\u0B9C' +
        '\u0B9E-\u0B9F\u0BA3-\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB5\u0BB7-\u0BB9' +
        '\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C60-\u0C61' +
        '\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CDE\u0CE0-\u0CE1' +
        '\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D28\u0D2A-\u0D39\u0D60-\u0D61' +
        '\u0E01-\u0E2E\u0E30\u0E32-\u0E33\u0E40-\u0E45' +
        '\u0E81-\u0E82\u0E84\u0E87-\u0E88\u0E8A\u0E8D' +
        '\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA-\u0EAB' +
        '\u0EAD-\u0EAE\u0EB0\u0EB2-\u0EB3\u0EBD\u0EC0-\u0EC4' +
        '\u0F40-\u0F47\u0F49-\u0F69' +
        '\u10A0-\u10C5\u10D0-\u10F6\u1100\u1102-\u1103\u1105-\u1107\u1109' +
        '\u110B-\u110C\u110E-\u1112\u113C\u113E\u1140\u114C\u114E\u1150' +
        '\u1154-\u1155\u1159\u115F-\u1161\u1163\u1165\u1167\u1169\u116D-\u116E' +
        '\u1172-\u1173\u1175\u119E\u11A8\u11AB\u11AE-\u11AF\u11B7-\u11B8\u11BA' +
        '\u11BC-\u11C2\u11EB\u11F0\u11F9' +
        '\u1E00-\u1E9B\u1EA0-\u1EF9' +
        '\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57' +
        '\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE' +
        '\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC' +
        '\u1FF2-\u1FF4\u1FF6-\u1FFC' +
        '\u2126\u212A-\u212B\u212E\u2180-\u2182' +
        '\u3041-\u3094\u30A1-\u30FA\u3105-\u312C\uAC00-\uD7A3' +
        '\u4E00-\u9FA5\u3007\u3021-\u3029';
    // \c — NameChar (XML 1.0 Second Edition, Appendix B, productions [4][87][88][89])
    static NAME_CHAR = XsdRegexTranslator.NAME_START_CHAR +
        '\\-\\.0-9' +
        '\u0660-\u0669\u06F0-\u06F9\u0966-\u096F\u09E6-\u09EF' +
        '\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0BE7-\u0BEF' +
        '\u0C66-\u0C6F\u0CE6-\u0CEF\u0D66-\u0D6F\u0E50-\u0E59' +
        '\u0ED0-\u0ED9\u0F20-\u0F29' +
        '\u0300-\u0345\u0360-\u0361\u0483-\u0486\u0591-\u05A1' +
        '\u05A3-\u05B9\u05BB-\u05BD\u05BF\u05C1-\u05C2\u05C4' +
        '\u064B-\u0652\u0670\u06D6-\u06DC\u06DD-\u06DF\u06E0-\u06E4' +
        '\u06E7-\u06E8\u06EA-\u06ED\u0901-\u0903\u093C\u093E-\u094C' +
        '\u094D\u0951-\u0954\u0962-\u0963\u0981-\u0983\u09BC\u09BE' +
        '\u09BF\u09C0-\u09C4\u09C7-\u09C8\u09CB-\u09CD\u09D7\u09E2-\u09E3' +
        '\u0A02\u0A3C\u0A3E\u0A3F\u0A40-\u0A42\u0A47-\u0A48\u0A4B-\u0A4D' +
        '\u0A70-\u0A71\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD' +
        '\u0B01-\u0B03\u0B3C\u0B3E-\u0B43\u0B47-\u0B48\u0B4B-\u0B4D' +
        '\u0B56-\u0B57\u0B82-\u0B83\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD' +
        '\u0BD7\u0C01-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56' +
        '\u0C82-\u0C83\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5-\u0CD6' +
        '\u0D02-\u0D03\u0D3E-\u0D43\u0D46-\u0D48\u0D4A-\u0D4D\u0D57' +
        '\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EB9\u0EBB-\u0EBC' +
        '\u0EC8-\u0ECD\u0F18-\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F' +
        '\u0F71-\u0F84\u0F86-\u0F8B\u0F90-\u0F95\u0F97\u0F99-\u0FAD' +
        '\u0FB1-\u0FB7\u0FB9\u20D0-\u20DC\u20E1\u302A-\u302F\u3099\u309A' +
        '\u00B7\u02D0\u02D1\u0387\u0640\u0E46\u0EC6\u3005' +
        '\u3031-\u3035\u309D-\u309E\u30FC-\u30FE';
    // XSD 1.0 \d is a fixed set of decimal-digit ranges frozen at Unicode 3.1.
    // Using \p{Nd} would reflect current Unicode, which diverges: e.g. U+0BE6 was added
    // to Nd after Unicode 3.1, and U+1369-U+1371 were removed in Unicode 6.0.
    // Unlike the name tables above, this one is stored as regex escape text
    // (\uXXXX / \u{XXXXX}), which is only valid in a `u`-flag RegExp.
    static XSD_DIGITS = '\\u0030-\\u0039\\u0660-\\u0669\\u06F0-\\u06F9\\u0966-\\u096F' +
        '\\u09E6-\\u09EF\\u0A66-\\u0A6F\\u0AE6-\\u0AEF\\u0B66-\\u0B6F' +
        '\\u0BE7-\\u0BEF\\u0C66-\\u0C6F\\u0CE6-\\u0CEF\\u0D66-\\u0D6F' +
        '\\u0E50-\\u0E59\\u0ED0-\\u0ED9\\u0F20-\\u0F29\\u1040-\\u1049' +
        '\\u1369-\\u1371\\u17E0-\\u17E9\\u1810-\\u1819\\uFF10-\\uFF19' +
        '\\u{1D7CE}-\\u{1D7FF}';
    // Characters assigned to L/M/N/S after Unicode 3.1 that must be excluded from \w.
    // XSD 1.0 \w is defined against Unicode 3.1; these were Cn (unassigned) at that time.
    static XSD_W_EXCLUDES = '\\u023F-\\u0240';
    // XSD category name -> JS Unicode property escape (positive form).
    static CATEGORY_MAP = {
        // Letter
        L: '\\p{L}', Lu: '\\p{Lu}', Ll: '\\p{Ll}', Lt: '\\p{Lt}',
        Lm: '\\p{Lm}', Lo: '\\p{Lo}',
        // Mark
        M: '\\p{M}', Mn: '\\p{Mn}', Mc: '\\p{Mc}', Me: '\\p{Me}',
        // Number
        N: '\\p{N}', Nd: '\\p{Nd}', Nl: '\\p{Nl}', No: '\\p{No}',
        // Punctuation
        P: '\\p{P}', Pc: '\\p{Pc}', Pd: '\\p{Pd}', Ps: '\\p{Ps}',
        Pe: '\\p{Pe}', Pi: '\\p{Pi}', Pf: '\\p{Pf}', Po: '\\p{Po}',
        // Symbol
        S: '\\p{S}', Sm: '\\p{Sm}', Sc: '\\p{Sc}', Sk: '\\p{Sk}',
        So: '\\p{So}',
        // Separator
        Z: '\\p{Z}', Zs: '\\p{Zs}', Zl: '\\p{Zl}', Zp: '\\p{Zp}',
        // Other
        C: '\\p{C}', Cc: '\\p{Cc}', Cf: '\\p{Cf}', Co: '\\p{Co}',
        Cn: '\\p{Cn}',
    };
    // XSD block name (without the "Is" prefix) -> class-body range text.
    static BLOCK_MAP = {
        BasicLatin: '\u0000-\u007F',
        Latin1Supplement: '\u0080-\u00FF',
        'Latin-1Supplement': '\u0080-\u00FF',
        LatinExtendedA: '\u0100-\u017F',
        'LatinExtended-A': '\u0100-\u017F',
        LatinExtendedB: '\u0180-\u024F',
        'LatinExtended-B': '\u0180-\u024F',
        IPAExtensions: '\u0250-\u02AF',
        SpacingModifierLetters: '\u02B0-\u02FF',
        CombiningDiacriticalMarks: '\u0300-\u036F',
        Greek: '\u0370-\u03FF',
        GreekandCoptic: '\u0370-\u03FF',
        Cyrillic: '\u0400-\u04FF',
        CyrillicSupplement: '\u0500-\u052F',
        Armenian: '\u0530-\u058F',
        Hebrew: '\u0590-\u05FF',
        Arabic: '\u0600-\u06FF',
        Syriac: '\u0700-\u074F',
        Thaana: '\u0780-\u07BF',
        Devanagari: '\u0900-\u097F',
        Bengali: '\u0980-\u09FF',
        Gurmukhi: '\u0A00-\u0A7F',
        Gujarati: '\u0A80-\u0AFF',
        Oriya: '\u0B00-\u0B7F',
        Tamil: '\u0B80-\u0BFF',
        Telugu: '\u0C00-\u0C7F',
        Kannada: '\u0C80-\u0CFF',
        Malayalam: '\u0D00-\u0D7F',
        Sinhala: '\u0D80-\u0DFF',
        Thai: '\u0E00-\u0E7F',
        Lao: '\u0E80-\u0EFF',
        Tibetan: '\u0F00-\u0FFF',
        Myanmar: '\u1000-\u109F',
        Georgian: '\u10A0-\u10FF',
        HangulJamo: '\u1100-\u11FF',
        Ethiopic: '\u1200-\u137F',
        Cherokee: '\u13A0-\u13FF',
        UnifiedCanadianAboriginalSyllabics: '\u1400-\u167F',
        Ogham: '\u1680-\u169F',
        Runic: '\u16A0-\u16FF',
        Khmer: '\u1780-\u17FF',
        Mongolian: '\u1800-\u18AF',
        LatinExtendedAdditional: '\u1E00-\u1EFF',
        GreekExtended: '\u1F00-\u1FFF',
        GeneralPunctuation: '\u2000-\u206F',
        SuperscriptsandSubscripts: '\u2070-\u209F',
        CurrencySymbols: '\u20A0-\u20CF',
        CombiningMarksforSymbols: '\u20D0-\u20FF',
        LetterlikeSymbols: '\u2100-\u214F',
        NumberForms: '\u2150-\u218F',
        Arrows: '\u2190-\u21FF',
        MathematicalOperators: '\u2200-\u22FF',
        MiscellaneousTechnical: '\u2300-\u23FF',
        ControlPictures: '\u2400-\u243F',
        OpticalCharacterRecognition: '\u2440-\u245F',
        EnclosedAlphanumerics: '\u2460-\u24FF',
        BoxDrawing: '\u2500-\u257F',
        BlockElements: '\u2580-\u259F',
        GeometricShapes: '\u25A0-\u25FF',
        MiscellaneousSymbols: '\u2600-\u26FF',
        Dingbats: '\u2700-\u27BF',
        BraillePatterns: '\u2800-\u28FF',
        CJKRadicalsSupplement: '\u2E80-\u2EFF',
        KangxiRadicals: '\u2F00-\u2FDF',
        IdeographicDescriptionCharacters: '\u2FF0-\u2FFF',
        CJKSymbolsandPunctuation: '\u3000-\u303F',
        Hiragana: '\u3040-\u309F',
        Katakana: '\u30A0-\u30FF',
        Bopomofo: '\u3100-\u312F',
        HangulCompatibilityJamo: '\u3130-\u318F',
        Kanbun: '\u3190-\u319F',
        BopomofoExtended: '\u31A0-\u31BF',
        EnclosedCJKLettersandMonths: '\u3200-\u32FF',
        CJKCompatibility: '\u3300-\u33FF',
        CJKUnifiedIdeographsExtensionA: '\u3400-\u4DBF',
        YijingHexagramSymbols: '\u4DC0-\u4DFF',
        CJKUnifiedIdeographs: '\u4E00-\u9FFF',
        YiSyllables: '\uA000-\uA48F',
        YiRadicals: '\uA490-\uA4CF',
        HangulSyllables: '\uAC00-\uD7AF',
        HighSurrogates: '\uD800-\uDB7F',
        HighPrivateUseSurrogates: '\uDB80-\uDBFF',
        LowSurrogates: '\uDC00-\uDFFF',
        PrivateUse: '\uE000-\uF8FF',
        CJKCompatibilityIdeographs: '\uF900-\uFAFF',
        AlphabeticPresentationForms: '\uFB00-\uFB4F',
        ArabicPresentationFormsA: '\uFB50-\uFDFF',
        'ArabicPresentationForms-A': '\uFB50-\uFDFF',
        CombiningHalfMarks: '\uFE20-\uFE2F',
        CJKCompatibilityForms: '\uFE30-\uFE4F',
        SmallFormVariants: '\uFE50-\uFE6F',
        ArabicPresentationFormsB: '\uFE70-\uFEFF',
        'ArabicPresentationForms-B': '\uFE70-\uFEFF',
        Specials: '\uFFF0-\uFFFF',
        HalfwidthandFullwidthForms: '\uFF00-\uFFEF',
        OldItalic: '\u{10300}-\u{1032F}',
        Gothic: '\u{10330}-\u{1034F}',
        Deseret: '\u{10400}-\u{1044F}',
        ByzantineMusicalSymbols: '\u{1D000}-\u{1D0FF}',
        MusicalSymbols: '\u{1D100}-\u{1D1FF}',
        MathematicalAlphanumericSymbols: '\u{1D400}-\u{1D7FF}',
        CJKUnifiedIdeographsExtensionB: '\u{20000}-\u{2A6DF}',
        CJKCompatibilityIdeographsSupplement: '\u{2F800}-\u{2FA1F}',
        Tags: '\u{E0000}-\u{E007F}',
    };

    /**
     * Compiles an XSD pattern into a JS `RegExp` that must match the whole
     * input (the translated source is wrapped in `^(?:` ... `)$`).
     *
     * @param {string} xsdPattern XSD regular expression
     * @returns {RegExp} `u`-flag regular expression
     * @throws {Error} on translation errors, or a `SyntaxError` from the
     *         `RegExp` constructor when the translated source is not valid JS
     */
    static toRegExp(xsdPattern) {
        const jsSource = XsdRegexTranslator.translate(xsdPattern);
        return new RegExp('^(?:' + jsSource + ')$', 'u');
    }

    /**
     * Translates an XSD pattern into un-anchored JS regex source.
     *
     * @param {string} xsdPattern XSD regular expression
     * @returns {string} JS regex source intended for the `u` flag
     * @throws {Error} on translation errors
     */
    static translate(xsdPattern) {
        return XsdRegexTranslator.parseExpression(xsdPattern, 0).result;
    }

    /**
     * Translates `src` from index `start` until the end of the string or,
     * when `stopAt` is given, until the first top-level occurrence of that
     * character (which is NOT consumed).
     *
     * @param {string} src full XSD pattern
     * @param {number} start index of the first character to translate
     * @param {string} [stopAt] optional single character that ends the expression
     * @returns {{result: string, end: number}} translated source and the index
     *          where parsing stopped
     * @throws {Error} on unmatched `(` or on errors raised by nested parsers
     */
    static parseExpression(src, start, stopAt) {
        let out = '';
        let i = start;
        while (i < src.length) {
            const ch = src[i];
            // Stop character (used when parsing inside groups)
            if (stopAt && ch === stopAt) {
                break;
            }
            if (ch === '\\') {
                const { result, end } = XsdRegexTranslator.parseEscape(src, i);
                out += result;
                i = end;
                continue;
            }
            if (ch === '[') {
                const { result, end } = XsdRegexTranslator.parseCharClass(src, i);
                out += result;
                i = end;
                continue;
            }
            if (ch === '.') {
                // XSD dot: any char except \n \r \x85 \u2028
                out += '[^\\n\\r\\x85\\u2028]';
                i++;
                continue;
            }
            if (ch === '(') {
                // Inline .NET flag groups: (?flags:...) where flags may include
                // n (explicit capture — no-op for matching), i, m, s.
                // Map (?n:...) → (?:...) since "n" only suppresses capture numbering.
                // The remaining accepted flags are re-emitted as a JS modifier group.
                // Anything that does not look like a flag prefix is left for the
                // recursive call to copy through unchanged (e.g. "(?:").
                let prefix = '(';
                let bodyStart = i + 1;
                if (src[i + 1] === '?') {
                    const flagEnd = src.indexOf(':', i + 2);
                    if (flagEnd !== -1) {
                        const rawFlags = src.substring(i + 2, flagEnd);
                        const flags = rawFlags.startsWith('+') ? rawFlags.slice(1) : rawFlags;
                        if (/^[nimsx]+$/.test(flags)) {
                            const jsFlags = flags.replaceAll('n', '');
                            prefix = jsFlags.length > 0 ? '(?' + jsFlags + ':' : '(?:';
                            bodyStart = flagEnd + 1;
                        }
                    }
                }
                const inner = XsdRegexTranslator.parseExpression(src, bodyStart, ')');
                if (src[inner.end] !== ')') {
                    throw new Error('XsdRegexTranslator: unmatched \'(\' at position ' + i);
                }
                out += prefix + inner.result + ')';
                i = inner.end + 1;
                continue;
            }
            // A '{' is a quantifier only when immediately followed by one or more digits.
            // Otherwise it is a literal character; find the closing '}' and escape both.
            if (ch === '{') {
                if (i + 1 < src.length && src[i + 1] >= '0' && src[i + 1] <= '9') {
                    // The digits, ',' and '}' that follow are copied by the
                    // generic pass-through at the bottom of the loop.
                    out += '{';
                    i++;
                    continue;
                }
                const closeIdx = src.indexOf('}', i + 1);
                if (closeIdx !== -1) {
                    out += '\\{' + src.substring(i + 1, closeIdx) + '\\}';
                    i = closeIdx + 1;
                }
                else {
                    out += '\\{';
                    i++;
                }
                continue;
            }
            if (ch === ']') {
                // A bare ']' outside a class is not allowed in u-mode; emit it escaped.
                out += '\\x5D';
                i++;
                continue;
            }
            // Quantifiers, alternation, anchors — pass through as-is.
            // XSD has no anchors, but the characters |, *, +, ?, } are
            // the same as in JS.
            out += ch;
            i++;
        }
        return { result: out, end: i };
    }

    /**
     * Translates an octal escape (`\` followed by one to three digits 0-7)
     * into a `\xHH` or `\uHHHH` escape.
     *
     * @param {string} src full XSD pattern
     * @param {number} i index of the backslash; `src[i + 1]` must be 0-7
     * @returns {{result: string, end: number}} hex escape and the index after
     *          the last octal digit consumed
     */
    static parseOctalEscape(src, i) {
        let j = i + 1;
        let octalStr = '';
        // At most three digits are consumed (j < i + 4).
        while (j < src.length && j < i + 4 && src[j] >= '0' && src[j] <= '7') {
            octalStr += src[j];
            j++;
        }
        const code = Number.parseInt(octalStr, 8);
        const hex = code <= 0xFF
            ? '\\x' + code.toString(16).padStart(2, '0')
            : '\\u' + code.toString(16).padStart(4, '0');
        return { result: hex, end: j };
    }

    /**
     * Translates one escape sequence that appears outside a bracket expression.
     *
     * @param {string} src full XSD pattern
     * @param {number} i index of the backslash
     * @returns {{result: string, end: number}} JS source for the escape and
     *          the index of the first character after it
     * @throws {Error} when the backslash is the last character of the pattern,
     *         or when a `\p`/`\P` name is malformed or unknown
     */
    static parseEscape(src, i) {
        const next = src[i + 1];
        if (next === undefined) {
            // Without this guard the default branch would emit the literal
            // text "\undefined".
            throw new Error('XsdRegexTranslator: dangling backslash at position ' + i);
        }
        switch (next) {
            // XSD-specific shorthand classes
            case 'i': return { result: '[' + XsdRegexTranslator.NAME_START_CHAR + ']', end: i + 2 };
            case 'I': return { result: '[^' + XsdRegexTranslator.NAME_START_CHAR + ']', end: i + 2 };
            case 'c': return { result: '[' + XsdRegexTranslator.NAME_CHAR + ']', end: i + 2 };
            case 'C': return { result: '[^' + XsdRegexTranslator.NAME_CHAR + ']', end: i + 2 };
            // XSD \s is narrower than JS \s — only U+0020, \t, \n, \r
            case 's': return { result: '[\\x20\\t\\n\\r]', end: i + 2 };
            case 'S': return { result: '[^\\x20\\t\\n\\r]', end: i + 2 };
            case 'd': return { result: '[' + XsdRegexTranslator.XSD_DIGITS + ']', end: i + 2 };
            case 'D': return { result: '[^' + XsdRegexTranslator.XSD_DIGITS + ']', end: i + 2 };
            // XSD \w is, per spec, [#x0000-#x10FFFF]-[\p{P}\p{Z}\p{C}], which is
            // equivalent to [\p{L}\p{M}\p{N}\p{S}]; XSD_W_EXCLUDES is carved out
            // with a negative lookahead (and added back for \W).
            case 'w': return { result: '(?:(?![' + XsdRegexTranslator.XSD_W_EXCLUDES + '])[\\p{L}\\p{M}\\p{N}\\p{S}])', end: i + 2 };
            case 'W': return { result: '(?:[^\\p{L}\\p{M}\\p{N}\\p{S}]|[' + XsdRegexTranslator.XSD_W_EXCLUDES + '])', end: i + 2 };
            // Unicode category / block escapes
            case 'p': {
                const { name, end } = XsdRegexTranslator.readBracedName(src, i + 2);
                return { result: XsdRegexTranslator.translateCategory(name, false), end };
            }
            case 'P': {
                const { name, end } = XsdRegexTranslator.readBracedName(src, i + 2);
                return { result: XsdRegexTranslator.translateCategory(name, true), end };
            }
            // \A, \Z and \z are Perl/PCRE string-boundary anchors; toRegExp already
            // wraps the pattern with ^ and $ so all three are no-ops here.
            case 'A': return { result: '', end: i + 2 };
            case 'Z': return { result: '', end: i + 2 };
            case 'z': return { result: '', end: i + 2 };
            // \- is a valid XSD identity escape but not in JS u-mode outside
            // a character class; map it to \x2D (literal hyphen).
            case '-': return { result: '\\x2D', end: i + 2 };
            // Everything else (including \n \r \t \\ \. etc.) is passed
            // through unchanged — JS understands them identically.
            default: {
                if (next >= '0' && next <= '7') {
                    return XsdRegexTranslator.parseOctalEscape(src, i);
                }
                return { result: '\\' + next, end: i + 2 };
            }
        }
    }

    /**
     * Translates a bracket expression, including XSD class subtraction
     * (`[base-[sub]]`).
     *
     * @param {string} src full XSD pattern
     * @param {number} start index of the opening `[`
     * @returns {{result: string, end: number}} JS source matching exactly one
     *          code point, and the index after the closing `]`
     * @throws {Error} on an empty or unterminated class, a POSIX class
     *         (`[:name:]`), or errors raised while parsing escapes
     */
    static parseCharClass(src, start) {
        let i = start + 1;
        const negate = src[i] === '^';
        if (negate)
            i++;
        if (src[i] === ']') {
            throw new Error('XsdRegexTranslator: empty character class at position ' + start);
        }
        // First pass: collect items as a typed list
        const items = [];
        let subtracted = null;
        while (i < src.length && src[i] !== ']') {
            // Detect -[ at current position: class subtraction
            if (src[i] === '-' && src[i + 1] === '[') {
                const inner = XsdRegexTranslator.parseCharClass(src, i + 1);
                subtracted = inner.result;
                i = inner.end;
                // The subtrahend must be the last thing in the class; the
                // check below verifies the closing ']' follows it.
                break;
            }
            if (src[i] === '[') {
                if (src[i + 1] === ':') {
                    throw new Error('XsdRegexTranslator: POSIX character class not supported at position ' + i);
                }
                // A literal '[' must be escaped inside a u-mode class.
                items.push(new CharClassItem(false, '\\x5B'));
                i++;
                continue;
            }
            if (src[i] === '\\') {
                const esc = XsdRegexTranslator.parseEscapeInsideClass(src, i);
                items.push(esc.item);
                i = esc.end;
            }
            else {
                items.push(new CharClassItem(false, src[i]));
                i++;
            }
        }
        // Consume closing ']'
        if (src[i] !== ']') {
            throw new Error('XsdRegexTranslator: unterminated character class at position ' + start);
        }
        i++;
        const baseExpr = XsdRegexTranslator.emitCharClass(items, negate);
        if (subtracted === null) {
            return { result: baseExpr, end: i };
        }
        // Class subtraction: [base-[sub]]
        // The 'u' flag has no subtraction syntax, so it is expressed with a
        // negative lookahead in front of the base class:
        // (?:(?!subtracted)base), which still consumes exactly one code point.
        return {
            result: '(?:(?!' + subtracted + ')' + baseExpr + ')',
            end: i,
        };
    }

    /**
     * Builds JS source for a bracket expression from its collected members.
     *
     * Members flagged `isComplement` (e.g. `\D`, `\P{L}`) cannot be placed
     * inside a JS class, so they are combined with alternation (union) or
     * lookaheads (intersection). Every multi-part result is wrapped in a
     * non-capturing group so that it behaves as a single atom: a quantifier
     * that follows the class in the pattern applies to the whole expression.
     *
     * @param {Array<{isComplement: boolean, content: string}>} items members
     * @param {boolean} negate true when the class started with `^`
     * @returns {string} JS source matching exactly one code point
     */
    static emitCharClass(items, negate) {
        const posContent = items.filter((it) => !it.isComplement).map((it) => it.content).join('');
        const compContents = items.filter((it) => it.isComplement).map((it) => it.content);
        if (compContents.length === 0) {
            return '[' + (negate ? '^' : '') + posContent + ']';
        }
        if (posContent === '' && compContents.length === 1) {
            // [^\D] is simply [digits]; [\D] is simply [^digits].
            return negate
                ? '[' + compContents[0] + ']'
                : '[^' + compContents[0] + ']';
        }
        if (negate) {
            // ¬(P ∪ ¬C1 ∪ ¬C2 …) = ¬P ∩ C1 ∩ C2 …
            // One bracket consumes the code point, the remaining sets are
            // checked with lookaheads. When there is no positive content the
            // last complement base is used as the consuming bracket.
            const bases = [...compContents];
            const consuming = posContent === ''
                ? '[' + bases.pop() + ']'
                : '[^' + posContent + ']';
            const lookaheads = bases.map((c) => '(?=[' + c + '])').join('');
            // Grouped so a following quantifier repeats lookaheads + bracket
            // together instead of binding to the bracket alone.
            return '(?:' + lookaheads + consuming + ')';
        }
        // P ∪ ¬C1 ∪ ¬C2 … → (?:[P]|[^c1]|[^c2]|...)
        const parts = posContent === '' ? [] : ['[' + posContent + ']'];
        for (const c of compContents) {
            parts.push('[^' + c + ']');
        }
        return '(?:' + parts.join('|') + ')';
    }

    /**
     * Translates one escape sequence that appears inside a bracket expression.
     *
     * NOTE: unlike the out-of-class `\w` / `\W`, the in-class forms do not
     * apply XSD_W_EXCLUDES.
     *
     * @param {string} src full XSD pattern
     * @param {number} i index of the backslash
     * @returns {{item: CharClassItem, end: number}} class member and the index
     *          of the first character after the escape
     * @throws {Error} when the backslash is the last character of the pattern,
     *         or when a `\p`/`\P` name is malformed or unknown
     */
    static parseEscapeInsideClass(src, i) {
        const next = src[i + 1];
        if (next === undefined) {
            throw new Error('XsdRegexTranslator: dangling backslash at position ' + i);
        }
        switch (next) {
            case 'i': return { item: new CharClassItem(false, XsdRegexTranslator.NAME_START_CHAR), end: i + 2 };
            case 'I': return { item: new CharClassItem(true, XsdRegexTranslator.NAME_START_CHAR), end: i + 2 };
            case 'c': return { item: new CharClassItem(false, XsdRegexTranslator.NAME_CHAR), end: i + 2 };
            case 'C': return { item: new CharClassItem(true, XsdRegexTranslator.NAME_CHAR), end: i + 2 };
            case 's': return { item: new CharClassItem(false, '\\x20\\t\\n\\r'), end: i + 2 };
            case 'S': return { item: new CharClassItem(true, '\\x20\\t\\n\\r'), end: i + 2 };
            case 'd': return { item: new CharClassItem(false, XsdRegexTranslator.XSD_DIGITS), end: i + 2 };
            case 'D': return { item: new CharClassItem(true, XsdRegexTranslator.XSD_DIGITS), end: i + 2 };
            case 'w': return { item: new CharClassItem(false, '\\p{L}\\p{M}\\p{N}\\p{S}'), end: i + 2 };
            case 'W': return { item: new CharClassItem(true, '\\p{L}\\p{M}\\p{N}\\p{S}'), end: i + 2 };
            case 'p': {
                const { name, end } = XsdRegexTranslator.readBracedName(src, i + 2);
                return { item: new CharClassItem(false, XsdRegexTranslator.resolveClassContent(name)), end };
            }
            case 'P': {
                const { name, end } = XsdRegexTranslator.readBracedName(src, i + 2);
                return { item: new CharClassItem(true, XsdRegexTranslator.resolveClassContent(name)), end };
            }
            // Anchors contribute nothing to a class.
            case 'A': return { item: new CharClassItem(false, ''), end: i + 2 };
            case 'Z': return { item: new CharClassItem(false, ''), end: i + 2 };
            case 'z': return { item: new CharClassItem(false, ''), end: i + 2 };
            default: {
                if (next >= '0' && next <= '7') {
                    const { result, end } = XsdRegexTranslator.parseOctalEscape(src, i);
                    return { item: new CharClassItem(false, result), end };
                }
                return { item: new CharClassItem(false, '\\' + next), end: i + 2 };
            }
        }
    }

    /**
     * Looks up the range text of a block escape name such as `IsBasicLatin`.
     * The name is tried as written and then with hyphens removed. Only own
     * keys of BLOCK_MAP are considered, so names like `Isconstructor` do not
     * resolve to inherited object members.
     *
     * @param {string} name block escape name including the `Is` prefix
     * @returns {string} class-body range text
     * @throws {Error} when the block is not in BLOCK_MAP
     */
    static lookupBlockRange(name) {
        const blockName = name.slice(2);
        for (const key of [blockName, blockName.replaceAll('-', '')]) {
            if (Object.hasOwn(XsdRegexTranslator.BLOCK_MAP, key)) {
                return XsdRegexTranslator.BLOCK_MAP[key];
            }
        }
        throw new Error('XsdRegexTranslator: unknown Unicode block \'' + name + '\'');
    }

    /**
     * Looks up the JS property escape for an XSD category name. Only own keys
     * of CATEGORY_MAP are considered.
     *
     * @param {string} name category name such as `L` or `Nd`
     * @returns {string} positive JS property escape, e.g. `\p{Nd}`
     * @throws {Error} when the category is not in CATEGORY_MAP
     */
    static lookupCategory(name) {
        if (Object.hasOwn(XsdRegexTranslator.CATEGORY_MAP, name)) {
            return XsdRegexTranslator.CATEGORY_MAP[name];
        }
        throw new Error('XsdRegexTranslator: unknown Unicode category \'' + name + '\'');
    }

    /**
     * Translates the name of a `\p{..}` / `\P{..}` escape used outside a
     * bracket expression into stand-alone JS source.
     *
     * @param {string} name text between the braces
     * @param {boolean} negate true for `\P`
     * @returns {string} a bracket expression (blocks) or property escape (categories)
     * @throws {Error} when the block or category is unknown
     */
    static translateCategory(name, negate) {
        // Block escape: \p{IsXxx}
        if (name.startsWith('Is')) {
            const range = XsdRegexTranslator.lookupBlockRange(name);
            return negate ? '[^' + range + ']' : '[' + range + ']';
        }
        // Category escape: must be in CATEGORY_MAP
        const mapped = XsdRegexTranslator.lookupCategory(name);
        return negate ? mapped.replace('\\p{', '\\P{') : mapped;
    }

    /**
     * Resolves the name of a `\p{..}` / `\P{..}` escape used inside a bracket
     * expression into positive class-body text; complementing is left to the
     * caller.
     *
     * @param {string} name text between the braces
     * @returns {string} range text (blocks) or property escape (categories)
     * @throws {Error} when the block or category is unknown
     */
    static resolveClassContent(name) {
        if (name.startsWith('Is')) {
            return XsdRegexTranslator.lookupBlockRange(name);
        }
        return XsdRegexTranslator.lookupCategory(name);
    }

    /**
     * Reads a `{name}` group.
     *
     * @param {string} src full XSD pattern
     * @param {number} i index that should point at `{`
     * @returns {{name: string, end: number}} text between the braces and the
     *          index after the closing `}`
     * @throws {Error} when `src[i]` is not `{` or no closing `}` exists
     */
    static readBracedName(src, i) {
        if (src[i] !== '{') {
            throw new Error('XsdRegexTranslator: expected \'{\' after \\p/\\P at position ' + i);
        }
        const close = src.indexOf('}', i + 1);
        if (close === -1) {
            throw new Error('XsdRegexTranslator: unterminated \\p{...} starting at position ' + i);
        }
        return { name: src.slice(i + 1, close), end: close + 1 };
    }
}
//# sourceMappingURL=XsdRegexTranslator.js.map