cdk-s3-vectors
Version:
A CDK construct library for Amazon S3 Vectors. This construct simplifies the creation of vector buckets, vector indexes with full configuration options, and Amazon Bedrock knowledge bases using S3 Vectors as the underlying vector store.
607 lines (550 loc) • 21.1 kB
JavaScript
'use strict';
///@ts-check
import {getAllMatches, isExist} from '../util.js';
import xmlNode from './xmlNode.js';
import readDocType from './DocTypeReader.js';
import toNumber from "strnum";
import getIgnoreAttributesFn from "../ignoreAttributes.js";
// const regx =
// '<((!\\[CDATA\\[([\\s\\S]*?)(]]>))|((NAME:)?(NAME))([^>]*)>|((\\/)(NAME)\\s*>))([^<]*)'
// .replace(/NAME/g, util.nameRegexp);
//const tagsRegx = new RegExp("<(\\/?[\\w:\\-\._]+)([^>]*)>(\\s*"+cdataRegx+")*([^<]+)?","g");
//const tagsRegx = new RegExp("<(\\/?)((\\w*:)?([\\w:\\-\._]+))([^>]*)>([^<]*)("+cdataRegx+"([^<]*))*([^<]+)?","g");
export default class OrderedObjParser{
constructor(options){
this.options = options;
this.currentNode = null;
this.tagsNodeStack = [];
this.docTypeEntities = {};
this.lastEntities = {
"apos" : { regex: /&(apos|#39|#x27);/g, val : "'"},
"gt" : { regex: /&(gt|#62|#x3E);/g, val : ">"},
"lt" : { regex: /&(lt|#60|#x3C);/g, val : "<"},
"quot" : { regex: /&(quot|#34|#x22);/g, val : "\""},
};
this.ampEntity = { regex: /&(amp|#38|#x26);/g, val : "&"};
this.htmlEntities = {
"space": { regex: /&(nbsp|#160);/g, val: " " },
// "lt" : { regex: /&(lt|#60);/g, val: "<" },
// "gt" : { regex: /&(gt|#62);/g, val: ">" },
// "amp" : { regex: /&(amp|#38);/g, val: "&" },
// "quot" : { regex: /&(quot|#34);/g, val: "\"" },
// "apos" : { regex: /&(apos|#39);/g, val: "'" },
"cent" : { regex: /&(cent|#162);/g, val: "¢" },
"pound" : { regex: /&(pound|#163);/g, val: "£" },
"yen" : { regex: /&(yen|#165);/g, val: "¥" },
"euro" : { regex: /&(euro|#8364);/g, val: "€" },
"copyright" : { regex: /&(copy|#169);/g, val: "©" },
"reg" : { regex: /&(reg|#174);/g, val: "®" },
"inr" : { regex: /&(inr|#8377);/g, val: "₹" },
"num_dec": { regex: /&#([0-9]{1,7});/g, val : (_, str) => String.fromCodePoint(Number.parseInt(str, 10)) },
"num_hex": { regex: /&#x([0-9a-fA-F]{1,6});/g, val : (_, str) => String.fromCodePoint(Number.parseInt(str, 16)) },
};
this.addExternalEntities = addExternalEntities;
this.parseXml = parseXml;
this.parseTextData = parseTextData;
this.resolveNameSpace = resolveNameSpace;
this.buildAttributesMap = buildAttributesMap;
this.isItStopNode = isItStopNode;
this.replaceEntitiesValue = replaceEntitiesValue;
this.readStopNodeData = readStopNodeData;
this.saveTextToParentTag = saveTextToParentTag;
this.addChild = addChild;
this.ignoreAttributesFn = getIgnoreAttributesFn(this.options.ignoreAttributes)
}
}
function addExternalEntities(externalEntities){
const entKeys = Object.keys(externalEntities);
for (let i = 0; i < entKeys.length; i++) {
const ent = entKeys[i];
this.lastEntities[ent] = {
regex: new RegExp("&"+ent+";","g"),
val : externalEntities[ent]
}
}
}
/**
* @param {string} val
* @param {string} tagName
* @param {string} jPath
* @param {boolean} dontTrim
* @param {boolean} hasAttributes
* @param {boolean} isLeafNode
* @param {boolean} escapeEntities
*/
function parseTextData(val, tagName, jPath, dontTrim, hasAttributes, isLeafNode, escapeEntities) {
if (val !== undefined) {
if (this.options.trimValues && !dontTrim) {
val = val.trim();
}
if(val.length > 0){
if(!escapeEntities) val = this.replaceEntitiesValue(val);
const newval = this.options.tagValueProcessor(tagName, val, jPath, hasAttributes, isLeafNode);
if(newval === null || newval === undefined){
//don't parse
return val;
}else if(typeof newval !== typeof val || newval !== val){
//overwrite
return newval;
}else if(this.options.trimValues){
return parseValue(val, this.options.parseTagValue, this.options.numberParseOptions);
}else{
const trimmedVal = val.trim();
if(trimmedVal === val){
return parseValue(val, this.options.parseTagValue, this.options.numberParseOptions);
}else{
return val;
}
}
}
}
}
function resolveNameSpace(tagname) {
if (this.options.removeNSPrefix) {
const tags = tagname.split(':');
const prefix = tagname.charAt(0) === '/' ? '/' : '';
if (tags[0] === 'xmlns') {
return '';
}
if (tags.length === 2) {
tagname = prefix + tags[1];
}
}
return tagname;
}
//TODO: change regex to capture NS
//const attrsRegx = new RegExp("([\\w\\-\\.\\:]+)\\s*=\\s*(['\"])((.|\n)*?)\\2","gm");
const attrsRegx = new RegExp('([^\\s=]+)\\s*(=\\s*([\'"])([\\s\\S]*?)\\3)?', 'gm');
function buildAttributesMap(attrStr, jPath, tagName) {
if (this.options.ignoreAttributes !== true && typeof attrStr === 'string') {
// attrStr = attrStr.replace(/\r?\n/g, ' ');
//attrStr = attrStr || attrStr.trim();
const matches = getAllMatches(attrStr, attrsRegx);
const len = matches.length; //don't make it inline
const attrs = {};
for (let i = 0; i < len; i++) {
const attrName = this.resolveNameSpace(matches[i][1]);
if (this.ignoreAttributesFn(attrName, jPath)) {
continue
}
let oldVal = matches[i][4];
let aName = this.options.attributeNamePrefix + attrName;
if (attrName.length) {
if (this.options.transformAttributeName) {
aName = this.options.transformAttributeName(aName);
}
if(aName === "__proto__") aName = "#__proto__";
if (oldVal !== undefined) {
if (this.options.trimValues) {
oldVal = oldVal.trim();
}
oldVal = this.replaceEntitiesValue(oldVal);
const newVal = this.options.attributeValueProcessor(attrName, oldVal, jPath);
if(newVal === null || newVal === undefined){
//don't parse
attrs[aName] = oldVal;
}else if(typeof newVal !== typeof oldVal || newVal !== oldVal){
//overwrite
attrs[aName] = newVal;
}else{
//parse
attrs[aName] = parseValue(
oldVal,
this.options.parseAttributeValue,
this.options.numberParseOptions
);
}
} else if (this.options.allowBooleanAttributes) {
attrs[aName] = true;
}
}
}
if (!Object.keys(attrs).length) {
return;
}
if (this.options.attributesGroupName) {
const attrCollection = {};
attrCollection[this.options.attributesGroupName] = attrs;
return attrCollection;
}
return attrs
}
}
const parseXml = function(xmlData) {
xmlData = xmlData.replace(/\r\n?/g, "\n"); //TODO: remove this line
const xmlObj = new xmlNode('!xml');
let currentNode = xmlObj;
let textData = "";
let jPath = "";
for(let i=0; i< xmlData.length; i++){//for each char in XML data
const ch = xmlData[i];
if(ch === '<'){
// const nextIndex = i+1;
// const _2ndChar = xmlData[nextIndex];
if( xmlData[i+1] === '/') {//Closing Tag
const closeIndex = findClosingIndex(xmlData, ">", i, "Closing Tag is not closed.")
let tagName = xmlData.substring(i+2,closeIndex).trim();
if(this.options.removeNSPrefix){
const colonIndex = tagName.indexOf(":");
if(colonIndex !== -1){
tagName = tagName.substr(colonIndex+1);
}
}
if(this.options.transformTagName) {
tagName = this.options.transformTagName(tagName);
}
if(currentNode){
textData = this.saveTextToParentTag(textData, currentNode, jPath);
}
//check if last tag of nested tag was unpaired tag
const lastTagName = jPath.substring(jPath.lastIndexOf(".")+1);
if(tagName && this.options.unpairedTags.indexOf(tagName) !== -1 ){
throw new Error(`Unpaired tag can not be used as closing tag: </${tagName}>`);
}
let propIndex = 0
if(lastTagName && this.options.unpairedTags.indexOf(lastTagName) !== -1 ){
propIndex = jPath.lastIndexOf('.', jPath.lastIndexOf('.')-1)
this.tagsNodeStack.pop();
}else{
propIndex = jPath.lastIndexOf(".");
}
jPath = jPath.substring(0, propIndex);
currentNode = this.tagsNodeStack.pop();//avoid recursion, set the parent tag scope
textData = "";
i = closeIndex;
} else if( xmlData[i+1] === '?') {
let tagData = readTagExp(xmlData,i, false, "?>");
if(!tagData) throw new Error("Pi Tag is not closed.");
textData = this.saveTextToParentTag(textData, currentNode, jPath);
if( (this.options.ignoreDeclaration && tagData.tagName === "?xml") || this.options.ignorePiTags){
}else{
const childNode = new xmlNode(tagData.tagName);
childNode.add(this.options.textNodeName, "");
if(tagData.tagName !== tagData.tagExp && tagData.attrExpPresent){
childNode[":@"] = this.buildAttributesMap(tagData.tagExp, jPath, tagData.tagName);
}
this.addChild(currentNode, childNode, jPath, i);
}
i = tagData.closeIndex + 1;
} else if(xmlData.substr(i + 1, 3) === '!--') {
const endIndex = findClosingIndex(xmlData, "-->", i+4, "Comment is not closed.")
if(this.options.commentPropName){
const comment = xmlData.substring(i + 4, endIndex - 2);
textData = this.saveTextToParentTag(textData, currentNode, jPath);
currentNode.add(this.options.commentPropName, [ { [this.options.textNodeName] : comment } ]);
}
i = endIndex;
} else if( xmlData.substr(i + 1, 2) === '!D') {
const result = readDocType(xmlData, i);
this.docTypeEntities = result.entities;
i = result.i;
}else if(xmlData.substr(i + 1, 2) === '![') {
const closeIndex = findClosingIndex(xmlData, "]]>", i, "CDATA is not closed.") - 2;
const tagExp = xmlData.substring(i + 9,closeIndex);
textData = this.saveTextToParentTag(textData, currentNode, jPath);
let val = this.parseTextData(tagExp, currentNode.tagname, jPath, true, false, true, true);
if(val == undefined) val = "";
//cdata should be set even if it is 0 length string
if(this.options.cdataPropName){
currentNode.add(this.options.cdataPropName, [ { [this.options.textNodeName] : tagExp } ]);
}else{
currentNode.add(this.options.textNodeName, val);
}
i = closeIndex + 2;
}else {//Opening tag
let result = readTagExp(xmlData,i, this.options.removeNSPrefix);
let tagName= result.tagName;
const rawTagName = result.rawTagName;
let tagExp = result.tagExp;
let attrExpPresent = result.attrExpPresent;
let closeIndex = result.closeIndex;
if (this.options.transformTagName) {
tagName = this.options.transformTagName(tagName);
}
//save text as child node
if (currentNode && textData) {
if(currentNode.tagname !== '!xml'){
//when nested tag is found
textData = this.saveTextToParentTag(textData, currentNode, jPath, false);
}
}
//check if last tag was unpaired tag
const lastTag = currentNode;
if(lastTag && this.options.unpairedTags.indexOf(lastTag.tagname) !== -1 ){
currentNode = this.tagsNodeStack.pop();
jPath = jPath.substring(0, jPath.lastIndexOf("."));
}
if(tagName !== xmlObj.tagname){
jPath += jPath ? "." + tagName : tagName;
}
const startIndex = i;
if (this.isItStopNode(this.options.stopNodes, jPath, tagName)) {
let tagContent = "";
//self-closing tag
if(tagExp.length > 0 && tagExp.lastIndexOf("/") === tagExp.length - 1){
if(tagName[tagName.length - 1] === "/"){ //remove trailing '/'
tagName = tagName.substr(0, tagName.length - 1);
jPath = jPath.substr(0, jPath.length - 1);
tagExp = tagName;
}else{
tagExp = tagExp.substr(0, tagExp.length - 1);
}
i = result.closeIndex;
}
//unpaired tag
else if(this.options.unpairedTags.indexOf(tagName) !== -1){
i = result.closeIndex;
}
//normal tag
else{
//read until closing tag is found
const result = this.readStopNodeData(xmlData, rawTagName, closeIndex + 1);
if(!result) throw new Error(`Unexpected end of ${rawTagName}`);
i = result.i;
tagContent = result.tagContent;
}
const childNode = new xmlNode(tagName);
if(tagName !== tagExp && attrExpPresent){
childNode[":@"] = this.buildAttributesMap(tagExp, jPath, tagName);
}
if(tagContent) {
tagContent = this.parseTextData(tagContent, tagName, jPath, true, attrExpPresent, true, true);
}
jPath = jPath.substr(0, jPath.lastIndexOf("."));
childNode.add(this.options.textNodeName, tagContent);
this.addChild(currentNode, childNode, jPath, startIndex);
}else{
//selfClosing tag
if(tagExp.length > 0 && tagExp.lastIndexOf("/") === tagExp.length - 1){
if(tagName[tagName.length - 1] === "/"){ //remove trailing '/'
tagName = tagName.substr(0, tagName.length - 1);
jPath = jPath.substr(0, jPath.length - 1);
tagExp = tagName;
}else{
tagExp = tagExp.substr(0, tagExp.length - 1);
}
if(this.options.transformTagName) {
tagName = this.options.transformTagName(tagName);
}
const childNode = new xmlNode(tagName);
if(tagName !== tagExp && attrExpPresent){
childNode[":@"] = this.buildAttributesMap(tagExp, jPath, tagName);
}
this.addChild(currentNode, childNode, jPath, startIndex);
jPath = jPath.substr(0, jPath.lastIndexOf("."));
}
//opening tag
else{
const childNode = new xmlNode( tagName);
this.tagsNodeStack.push(currentNode);
if(tagName !== tagExp && attrExpPresent){
childNode[":@"] = this.buildAttributesMap(tagExp, jPath, tagName);
}
this.addChild(currentNode, childNode, jPath, startIndex);
currentNode = childNode;
}
textData = "";
i = closeIndex;
}
}
}else{
textData += xmlData[i];
}
}
return xmlObj.child;
}
function addChild(currentNode, childNode, jPath, startIndex){
// unset startIndex if not requested
if (!this.options.captureMetaData) startIndex = undefined;
const result = this.options.updateTag(childNode.tagname, jPath, childNode[":@"])
if(result === false){
} else if(typeof result === "string"){
childNode.tagname = result
currentNode.addChild(childNode, startIndex);
}else{
currentNode.addChild(childNode, startIndex);
}
}
const replaceEntitiesValue = function(val){
if(this.options.processEntities){
for(let entityName in this.docTypeEntities){
const entity = this.docTypeEntities[entityName];
val = val.replace( entity.regx, entity.val);
}
for(let entityName in this.lastEntities){
const entity = this.lastEntities[entityName];
val = val.replace( entity.regex, entity.val);
}
if(this.options.htmlEntities){
for(let entityName in this.htmlEntities){
const entity = this.htmlEntities[entityName];
val = val.replace( entity.regex, entity.val);
}
}
val = val.replace( this.ampEntity.regex, this.ampEntity.val);
}
return val;
}
function saveTextToParentTag(textData, currentNode, jPath, isLeafNode) {
if (textData) { //store previously collected data as textNode
if(isLeafNode === undefined) isLeafNode = currentNode.child.length === 0
textData = this.parseTextData(textData,
currentNode.tagname,
jPath,
false,
currentNode[":@"] ? Object.keys(currentNode[":@"]).length !== 0 : false,
isLeafNode);
if (textData !== undefined && textData !== "")
currentNode.add(this.options.textNodeName, textData);
textData = "";
}
return textData;
}
//TODO: use jPath to simplify the logic
/**
*
* @param {string[]} stopNodes
* @param {string} jPath
* @param {string} currentTagName
*/
function isItStopNode(stopNodes, jPath, currentTagName){
const allNodesExp = "*." + currentTagName;
for (const stopNodePath in stopNodes) {
const stopNodeExp = stopNodes[stopNodePath];
if( allNodesExp === stopNodeExp || jPath === stopNodeExp ) return true;
}
return false;
}
/**
* Returns the tag Expression and where it is ending handling single-double quotes situation
* @param {string} xmlData
* @param {number} i starting index
* @returns
*/
function tagExpWithClosingIndex(xmlData, i, closingChar = ">"){
let attrBoundary;
let tagExp = "";
for (let index = i; index < xmlData.length; index++) {
let ch = xmlData[index];
if (attrBoundary) {
if (ch === attrBoundary) attrBoundary = "";//reset
} else if (ch === '"' || ch === "'") {
attrBoundary = ch;
} else if (ch === closingChar[0]) {
if(closingChar[1]){
if(xmlData[index + 1] === closingChar[1]){
return {
data: tagExp,
index: index
}
}
}else{
return {
data: tagExp,
index: index
}
}
} else if (ch === '\t') {
ch = " "
}
tagExp += ch;
}
}
function findClosingIndex(xmlData, str, i, errMsg){
const closingIndex = xmlData.indexOf(str, i);
if(closingIndex === -1){
throw new Error(errMsg)
}else{
return closingIndex + str.length - 1;
}
}
function readTagExp(xmlData,i, removeNSPrefix, closingChar = ">"){
const result = tagExpWithClosingIndex(xmlData, i+1, closingChar);
if(!result) return;
let tagExp = result.data;
const closeIndex = result.index;
const separatorIndex = tagExp.search(/\s/);
let tagName = tagExp;
let attrExpPresent = true;
if(separatorIndex !== -1){//separate tag name and attributes expression
tagName = tagExp.substring(0, separatorIndex);
tagExp = tagExp.substring(separatorIndex + 1).trimStart();
}
const rawTagName = tagName;
if(removeNSPrefix){
const colonIndex = tagName.indexOf(":");
if(colonIndex !== -1){
tagName = tagName.substr(colonIndex+1);
attrExpPresent = tagName !== result.data.substr(colonIndex + 1);
}
}
return {
tagName: tagName,
tagExp: tagExp,
closeIndex: closeIndex,
attrExpPresent: attrExpPresent,
rawTagName: rawTagName,
}
}
/**
* find paired tag for a stop node
* @param {string} xmlData
* @param {string} tagName
* @param {number} i
*/
function readStopNodeData(xmlData, tagName, i){
const startIndex = i;
// Starting at 1 since we already have an open tag
let openTagCount = 1;
for (; i < xmlData.length; i++) {
if( xmlData[i] === "<"){
if (xmlData[i+1] === "/") {//close tag
const closeIndex = findClosingIndex(xmlData, ">", i, `${tagName} is not closed`);
let closeTagName = xmlData.substring(i+2,closeIndex).trim();
if(closeTagName === tagName){
openTagCount--;
if (openTagCount === 0) {
return {
tagContent: xmlData.substring(startIndex, i),
i : closeIndex
}
}
}
i=closeIndex;
} else if(xmlData[i+1] === '?') {
const closeIndex = findClosingIndex(xmlData, "?>", i+1, "StopNode is not closed.")
i=closeIndex;
} else if(xmlData.substr(i + 1, 3) === '!--') {
const closeIndex = findClosingIndex(xmlData, "-->", i+3, "StopNode is not closed.")
i=closeIndex;
} else if(xmlData.substr(i + 1, 2) === '![') {
const closeIndex = findClosingIndex(xmlData, "]]>", i, "StopNode is not closed.") - 2;
i=closeIndex;
} else {
const tagData = readTagExp(xmlData, i, '>')
if (tagData) {
const openTagName = tagData && tagData.tagName;
if (openTagName === tagName && tagData.tagExp[tagData.tagExp.length-1] !== "/") {
openTagCount++;
}
i=tagData.closeIndex;
}
}
}
}//end for loop
}
function parseValue(val, shouldParse, options) {
if (shouldParse && typeof val === 'string') {
//console.log(options)
const newval = val.trim();
if(newval === 'true' ) return true;
else if(newval === 'false' ) return false;
else return toNumber(val, options);
} else {
if (isExist(val)) {
return val;
} else {
return '';
}
}
}