// tei-util
// Utility functions for dealing with TEI in JavaScript
// (package listing metadata: 383 lines (362 loc) • 15.9 kB, text/typescript; version not recorded)
import deepcopy from 'deepcopy';
import { TEIConfig, TEIConfigSection, TEIConfigSerialiseAttribute, TEIConfigSerialiseElement, TEIDocument, TEIMetadataNode, TEITextDocumentCollection, TEITextMark, TEITextNode } from './types';
/**
 * Minimal in-memory representation of a single XML element.
 *
 * `text` is `null` when the element carries no text content of its own; the
 * serialiser in this file only descends into `children` in that case.
 */
export type XMLElement = {
  /** Qualified element name, e.g. `tei:p`. */
  tag: string;
  /** Child elements in document order. */
  children: Array<XMLElement>;
  /** Text content, or `null` for an element without text. */
  text: string | null;
  /** Attribute values keyed by attribute name. */
  attributes: Record<string, string>;
};
/**
 * Serialisation rules for one configured element (node or mark) type.
 */
type ElementSchema = {
  /** How the element is written out (target element path, text handling, ...). */
  serialise: TEIConfigSerialiseElement;
  /** Names of the configured attributes that may be set on this element. */
  attributes: Array<string>;
};
/** Lookup table from a node/mark type name to its element serialisation schema. */
type ElementSchemaDict = Record<string, ElementSchema>;
/**
 * Serialisation rules for one configured attribute.
 */
type AttributeSchema = {
  /** How the attribute value is mapped onto an XML attribute. */
  serialise: TEIConfigSerialiseAttribute;
};
/** Lookup table from a configured attribute name to its serialisation schema. */
type AttributeSchemaDict = Record<string, AttributeSchema>;
/**
 * Serialises a TEI document from its JSON representation into an XML string.
 *
 * The configuration supplies the document sections plus, per element and
 * attribute type, the rules describing how they are written to XML. Element
 * targets are given as simple paths such as
 * `tei:text/tei:body[@type="main" and @n="1"]`: `/` separates nesting levels
 * and `[@name="value" and ...]` attaches fixed attributes.
 */
export class TEISerialiser {
  /** The five characters that must be escaped in XML text and attribute values. */
  private static readonly XML_ENTITIES: {[char: string]: string} = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&apos;',
  };

  private sections = [] as TEIConfigSection[];
  private elements = {} as ElementSchemaDict;
  private attributes = {} as AttributeSchemaDict;

  /**
   * @param config The TEI configuration. Only elements that define a
   *   `serialise` rule are registered; all configured attributes are kept.
   */
  constructor(config: TEIConfig) {
    this.sections = config.sections;
    this.elements = Object.fromEntries(
      config.elements
        .filter((element) => element.serialise)
        .map((element) => [element.name, {
          serialise: element.serialise,
          attributes: element.attrs,
        }]),
    );
    this.attributes = Object.fromEntries(
      config.attributes.map((attribute) => [attribute.name, {
        serialise: attribute.serialise,
      }]),
    );
  }

  /**
   * Serialise the document into a TEI XML string.
   *
   * @param doc The document to serialise. It is deep-copied first because
   *   serialising nested documents writes `type` and `attrs` onto them.
   * @returns The XML document, one element per line, preceded by the XML declaration.
   */
  serialise(doc: TEIDocument): string {
    const copy = deepcopy(doc);
    const tree = this.buildBaseElement('tei:TEI[@xmlns:tei="http://www.tei-c.org/ns/1.0"]');
    for (const section of this.sections) {
      if (section.type === 'text' && section.serialise && copy[section.name]) {
        const [root, leaf] = this.buildElement(section.serialise.element);
        const collection = copy[section.name] as TEITextDocumentCollection;
        const nestedIds = this.textDoc2tree(leaf, collection.main);
        this.serialiseNestedDocs(leaf, nestedIds, collection);
        this.mergeTree(tree, root);
      } else if (section.type === 'metadata' && copy[section.name]) {
        this.mergeTree(tree, copy[section.name] as TEIMetadataNode);
      }
    }
    const lines = [
      '<?xml version="1.0" encoding="UTF-8"?>',
    ];
    this.serialiseTree(tree, lines);
    return lines.join('\n');
  }

  /**
   * Serialise every nested document reachable from `nestedIds` and append the
   * results, sorted by `xml:id`, to `leaf`.
   *
   * Nested documents may reference further nested documents; those ids are
   * queued as they are discovered and each id is processed at most once.
   *
   * @param leaf Element that receives the serialised nested documents.
   * @param nestedIds Ids referenced from the main document.
   * @param sections Collection whose `nested` map holds the nested documents.
   */
  private serialiseNestedDocs(leaf: XMLElement, nestedIds: string[], sections: TEITextDocumentCollection) {
    const processedIds = new Set<string>();
    const tmpRoot: XMLElement = {
      tag: '',
      children: [],
      text: '',
      attributes: {},
    };
    // Work on a copy so the caller's array is left untouched.
    let pendingIds = nestedIds.slice();
    while (pendingIds.length > 0) {
      const nestedId = pendingIds.pop();
      if (!nestedId || processedIds.has(nestedId) || !sections.nested) {
        continue;
      }
      for (const section of Object.values(sections.nested)) {
        const nestedDoc = section[nestedId];
        if (nestedDoc) {
          // The nested document's root node takes the type of the nested
          // document and carries the id so that it can be serialised as an attribute.
          nestedDoc.doc.type = nestedDoc.type;
          nestedDoc.doc.attrs = {xmlid: nestedId};
          pendingIds = pendingIds.concat(this.textDoc2tree(tmpRoot, nestedDoc.doc));
        }
      }
      processedIds.add(nestedId);
    }
    tmpRoot.children.sort((a, b) => {
      // An element without an xml:id sorts as the empty string instead of throwing.
      return this.compareXmlIds(a.attributes['xml:id'] || '', b.attributes['xml:id'] || '');
    });
    leaf.children.push(...tmpRoot.children);
  }

  /**
   * Order two xml:id values. Ids of the form `key-number` that share the same
   * key are ordered numerically (so `note-2` precedes `note-10`); everything
   * else falls back to plain string comparison.
   */
  private compareXmlIds(idA: string, idB: string): number {
    if (idA.includes('-') && idB.includes('-')) {
      const [keyA, valueA] = idA.split('-');
      const [keyB, valueB] = idB.split('-');
      if (keyA !== keyB) {
        return keyA > keyB ? 1 : -1;
      }
      const difference = Number.parseInt(valueA, 10) - Number.parseInt(valueB, 10);
      if (!Number.isNaN(difference)) {
        return difference;
      }
      // Non-numeric suffix: fall through to the string comparison.
    }
    if (idA > idB) {
      return 1;
    } else if (idA < idB) {
      return -1;
    }
    return 0;
  }

  /**
   * Build a single element from one path step such as
   * `tei:hi[@rend="bold" and @n="1"]`.
   *
   * @param path Tag name, optionally followed by bracketed `@name="value"`
   *   constraints joined with ` and `.
   * @returns A childless element with `text` set to `null` and the constraint
   *   attributes applied.
   */
  private buildBaseElement(path: string): XMLElement {
    const bracketIdx = path.indexOf('[');
    const element: XMLElement = {
      tag: bracketIdx >= 0 ? path.substring(0, bracketIdx) : path,
      children: [],
      text: null,
      attributes: {},
    };
    if (bracketIdx >= 0) {
      for (const expression of path.split('[')) {
        for (const rawConstraint of expression.split(' and ')) {
          let constraint = rawConstraint.trim();
          if (!constraint.startsWith('@')) {
            continue;
          }
          // Always drop the leading '@'; only the last constraint of a
          // bracket expression carries the closing ']'.
          constraint = constraint.substring(1);
          if (constraint.endsWith(']')) {
            constraint = constraint.substring(0, constraint.length - 1);
          }
          // Split on the first '=' only so that values may contain '='.
          const separatorIdx = constraint.indexOf('=');
          if (separatorIdx < 0) {
            continue;
          }
          const name = constraint.substring(0, separatorIdx).trim();
          const value = constraint.substring(separatorIdx + 1).replace(/"/g, '').trim();
          element.attributes[name] = value;
        }
      }
    }
    return element;
  }

  /**
   * Build a chain of nested elements from a `/`-separated path.
   *
   * @param path Path whose steps are each understood by `buildBaseElement`.
   * @returns The outermost (`root`) and innermost (`leaf`) elements of the chain.
   */
  private buildElement(path: string): [XMLElement, XMLElement] {
    const [first, ...rest] = path.split('/');
    const root = this.buildBaseElement(first);
    let leaf = root;
    for (const part of rest) {
      const child = this.buildBaseElement(part);
      leaf.children.push(child);
      leaf = child;
    }
    return [root, leaf];
  }

  /**
   * Compute the XML attributes for a node or mark from its `attrs` and the
   * attribute schemas listed on its element schema.
   *
   * An attribute schema either maps the value through a `values` lookup table,
   * passes it through unchanged (`value: 'value()'`), or substitutes it into a
   * template containing `{value()}`. A template is emitted verbatim when the
   * node or mark has no value for that attribute.
   */
  private buildAttributes(nodeOrMark: TEITextNode | TEITextMark, schema: ElementSchema): {[x: string]: string} {
    const attributes = {} as {[key: string]: string};
    const attrs = nodeOrMark.attrs;
    if (!schema.attributes || !attrs) {
      return attributes;
    }
    for (const name of schema.attributes) {
      const attributeSchema = this.attributes[name];
      if (!attributeSchema || !attributeSchema.serialise) {
        continue;
      }
      const serialise = attributeSchema.serialise;
      const value = attrs[name];
      if (serialise.attribute && serialise.values) {
        if (serialise.values[value]) {
          attributes[serialise.attribute] = serialise.values[value];
        }
      } else if (serialise.attribute && serialise.value) {
        if (serialise.value === 'value()') {
          if (value) {
            attributes[serialise.attribute] = value;
          }
        } else if (value) {
          // A replacer function keeps '$' sequences in the value literal.
          attributes[serialise.attribute] = serialise.value.replace(/\{value\(\)\}/g, () => value);
        } else {
          attributes[serialise.attribute] = serialise.value;
        }
      }
    }
    return attributes;
  }

  /**
   * Build the XML element for a node, including its text (when the schema
   * requests `text()`) and its attributes.
   */
  private buildNodeElement(node: TEITextNode, schema: ElementSchema): XMLElement {
    const element = this.buildBaseElement(schema.serialise.element);
    if (schema.serialise.text === 'text()' && node.text) {
      element.text = node.text;
    }
    element.attributes = { ...element.attributes, ...this.buildAttributes(node, schema) };
    return element;
  }

  /** Build the XML element for a mark, including its attributes. */
  private buildMarkElement(mark: TEITextMark, schema: ElementSchema): XMLElement {
    const element = this.buildBaseElement(schema.serialise.element);
    element.attributes = { ...element.attributes, ...this.buildAttributes(mark, schema) };
    return element;
  }

  /**
   * Sort key for a mark weight: positive weights come first in ascending
   * order, marks without a weight (or weight 0) next, negative weights last
   * with those closest to zero first.
   */
  private markRank(weight?: number): [number, number] {
    if (weight && weight > 0) {
      return [0, weight];
    } else if (weight && weight < 0) {
      return [2, -weight];
    }
    return [1, 0];
  }

  /** Order two mark elements by weight rank, falling back to the tag name. */
  private compareMarks(markA: [XMLElement, ElementSchema], markB: [XMLElement, ElementSchema]): number {
    const [groupA, orderA] = this.markRank(markA[1].serialise.weight);
    const [groupB, orderB] = this.markRank(markB[1].serialise.weight);
    if (groupA !== groupB) {
      return groupA - groupB;
    }
    if (orderA !== orderB) {
      return orderA - orderB;
    }
    if (markA[0].tag > markB[0].tag) {
      return 1;
    } else if (markA[0].tag < markB[0].tag) {
      return -1;
    }
    return 0;
  }

  /**
   * Convert a text document node (and its descendants) into XML elements
   * appended to `leaf`.
   *
   * Nodes without a serialisation schema are transparent: their children are
   * attached directly to `leaf`. A node with marks is written as a chain of
   * nested mark elements, the innermost one receiving the node text.
   *
   * @returns The ids of all nested documents referenced via `nestedTarget` marks.
   */
  private textDoc2tree(leaf: XMLElement, node: TEITextNode): string[] {
    let nestedIds = [] as string[];
    const elementSchema = this.elements[node.type];
    if (!elementSchema || !elementSchema.serialise) {
      // A schema-less node may legitimately have no content at all.
      for (const child of node.content || []) {
        nestedIds = nestedIds.concat(this.textDoc2tree(leaf, child));
      }
      return nestedIds;
    }
    if (node.marks && node.marks.length > 0) {
      const markElements = [] as [XMLElement, ElementSchema][];
      for (const mark of node.marks) {
        const markSchema = this.elements[mark.type];
        if (markSchema && markSchema.serialise) {
          const referencesNested = markSchema.attributes && markSchema.attributes.indexOf('nestedTarget') >= 0;
          if (referencesNested && mark.attrs && mark.attrs.nestedTarget) {
            nestedIds.push(mark.attrs.nestedTarget);
          }
          markElements.push([this.buildMarkElement(mark, markSchema), markSchema]);
        }
      }
      markElements.sort((markA, markB) => this.compareMarks(markA, markB));
      const text = node.text || '';
      let parent = leaf;
      markElements.forEach(([element, schema], idx) => {
        if (idx === markElements.length - 1) {
          // Only the innermost mark carries the text and resolves {text()} templates.
          if (schema.serialise.text === 'text()' && node.text) {
            element.text = node.text;
          }
          for (const key of Object.keys(element.attributes)) {
            if (element.attributes[key]) {
              // A replacer function keeps '$' sequences in the text literal.
              element.attributes[key] = element.attributes[key].replace(/\{text\(\)\}/g, () => text);
            }
          }
        }
        parent.children.push(element);
        parent = element;
      });
    } else {
      const element = this.buildNodeElement(node, elementSchema);
      leaf.children.push(element);
      if (node.content) {
        for (const child of node.content) {
          nestedIds = nestedIds.concat(this.textDoc2tree(element, child));
        }
      }
    }
    return nestedIds;
  }

  /**
   * Merge the `merge` tree into `base` in place.
   *
   * When the tags match, attributes are combined (`merge` wins) and children
   * are merged recursively into the first child with the same tag, or
   * appended. Otherwise `merge` itself is merged into, or appended as, a child
   * of `base`.
   */
  private mergeTree(base: XMLElement, merge: XMLElement) {
    if (base.tag === merge.tag) {
      base.attributes = { ...base.attributes, ...merge.attributes };
      for (const mergeChild of merge.children) {
        const existing = base.children.find((child) => child.tag === mergeChild.tag);
        if (existing) {
          this.mergeTree(existing, mergeChild);
        } else {
          base.children.push(mergeChild);
        }
      }
    } else {
      const existing = base.children.find((child) => child.tag === merge.tag);
      if (existing) {
        this.mergeTree(existing, merge);
      } else {
        base.children.push(merge);
      }
    }
  }

  /**
   * Append the XML lines for `element` and its descendants to `lines`.
   *
   * Attributes are written in alphabetical order. An element with non-empty
   * text is written on a single line and its children are not written; an
   * element with `null` text and children is written as open tag, indented
   * children and close tag; anything else becomes a self-closing tag.
   *
   * @param indent Whitespace prefix for this element; defaults to none.
   */
  private serialiseTree(element: XMLElement, lines: string[], indent?: string | undefined) {
    const prefix = indent === undefined ? '' : indent;
    const attributes = Object.entries(element.attributes)
      .sort(([keyA], [keyB]) => {
        if (keyA > keyB) {
          return 1;
        } else if (keyA < keyB) {
          return -1;
        }
        return 0;
      })
      .map(([name, value]) => ' ' + name + '="' + this.xmlEscape(value) + '"')
      .join('');
    const start = prefix + '<' + element.tag + attributes;
    if (element.text !== null && element.text !== '') {
      lines.push(start + '>' + this.xmlEscape(element.text) + '</' + element.tag + '>');
    } else if (element.text === null && element.children.length > 0) {
      lines.push(start + '>');
      for (const child of element.children) {
        this.serialiseTree(child, lines, prefix + ' ');
      }
      lines.push(prefix + '</' + element.tag + '>');
    } else {
      lines.push(start + '/>');
    }
  }

  /**
   * Escape the characters `&`, `<`, `>`, `"` and `'` with their predefined
   * XML entities so the string is safe in text content and attribute values.
   */
  private xmlEscape(str: string): string {
    return str.replace(/["'<>&]/g, (value) => TEISerialiser.XML_ENTITIES[value]);
  }
}