// Diffusion JavaScript client — JSON structural delta implementation.
/*eslint valid-jsdoc: "off"*/
var JSONPointerMap = require('data/json/json-pointer-map');
var JSONPointer = require('data/json/json-pointer');
var SpanParser = require('data/json/span-parser');
// Report whether the bytes of `original` equal the byte range of `other`
// beginning `start` bytes into its window, for the given length.
function isPartOf(original, other, start, length) {
    var absoluteOffset = other.$offset + start;
    return original.equalBytes(other.$buffer, absoluteOffset, length);
}
module.exports = function JSONDeltaImpl(factory, original, newValue, binaryDelta) {
var inserted = new JSONPointerMap();
var removed = new JSONPointerMap();
function partOf(value, start, length) {
return new factory(value.$buffer, value.$offset + start, length);
}
function copyPartOf(value, start, length) {
var offsetStart = value.$offset + start;
var buffer = new Buffer(length);
value.$buffer.copy(buffer, 0, offsetStart, offsetStart + length);
return new factory(buffer, 0, length);
}
if (binaryDelta !== undefined) {
binaryDelta.visit(new DeltaVisitor(original, newValue, inserted, removed, partOf, copyPartOf));
} else {
inserted.put(JSONPointer.ROOT, original);
removed.put(JSONPointer.ROOT, newValue);
}
this.removed = function() {
return new ChangeMapImpl(removed);
};
this.inserted = function() {
return new ChangeMapImpl(inserted);
};
this.hasChanges = function() {
return removed.size !== 0 || inserted.size !== 0;
};
this.toString = function() {
return ['REMOVE ', removed, ' INSERT ', inserted].join('');
};
};
/**
* Incrementally parse the binary delta to calculate a structural delta.
*
* <p>
* The result reports all differences, but is not necessarily minimal. In
* particular there are occasional "false positive" REMOVE/INSERT pairs.
*
* <h2>The Algorithm</h2>
*
* <p>
* We re-constitute the match and insert visitor callbacks into a sequence
* of delete, insert, and match edits. The binary delta format guarantees
* there will not be consecutive callbacks of the same type, and that match
* callbacks are presented in order with no overlaps. Delete and match
* edits will be contiguous with no gap between the end of one edit and
* the start of the next.
* <p>
* The processing uses two SpanParsers, one for the old token stream and one
* for the new token streams. We move forward along each parser driven by
* the binary delta edits, skipping old values covered by a match, adding a
* REMOVE change for each old value covered by a delete, and adding an
* INSERT change for each new value covered by an insert.
*
* <p>
* The SpanParsers calculate the appropriate JSON pointers to return for a
* span. A span finishes at the end of the first token found after the
* target offset, with these exceptions:
* <ol>
*
* <li>If a split is found in a field name, the span continues until the
* first pointer is found in the value, potentially consuming one or more
* structure start tokens. The reporting of a change that affects a field
* does not distinguish between the field name and its value, except if the
* value is a structure and the change only affects part of the structure.
*
* <li>End structure tokens are eagerly consumed. This has two benefits.
* First, it collapses empty structures to the appropriate parent pointer.
* Second, the closing tokens of non-empty structures are associated with
* the last pointer, which simplifies boundary detection.
*
* </ol>
*
* <p>
* The implementation relies on the SpanParser not consuming start structure
* tokens eagerly at a split unless instructed to do so (see
* {@link SpanParser#spanToNext()}). This provides a tighter alignment
* between the detected binary differences and the token parsers, allowing a
* simpler pairing of tokens between the two streams. I'm not certain there
* aren't additional edge cases introduced by the eager consumption of
* structure tokens for field names.
*
* <p>
* The two token streams are treated symmetrically. Each SpanParser is only
* moved in a forward direction; there is no backtracking. We maintain the
* current byte position in each binary value as {@link #oldOffset} and
* {@link #newOffset}. After each edit is processed,
* <code>oldParser.nextByte() >= oldOffset</code> and
* <code>newParser.nextByte() >= newOffset</code>.
*
* <p>
* The fun happens at the edges, specifically where a token covers one or
* more binary edits. We call such a misaligned edge a "split".
*
* <h3>Splits</h3>
*
* <p>
* The appropriate processing of a split depends on the type of edit in
* which it is found. For all edit types, we detect and process
* <em>trailing edge</em> splits; i.e. splits detected at the end of a span
* because the end of the last parsed token is after the end edge. It turns
* out we must also consider <em>leading edge</em> splits for match edits.
*
* <h4>insert edits</h4>
*
* <p>
* We insert everything in the span. If a trailing edge split is detected,
* we do one of two things.
* <ul>
* <li>If the old stream parser is at a split, there is nothing to do.
* The split will have been considered and processed for an earlier edit.
*
* <li>If the old stream parser is at <code>oldOffset</code> (i.e. is not at a
* split token), we consume and add a REMOVE change for the next pointer in
* the old stream. The old stream is otherwise unaffected by the insert
* edit, so <code>oldOffset</code> is the start of the next delete or match
* edit. Consuming the token might move it further into other delete or
* match edits. In all cases, correct handling of the insert split requires
* adding a REMOVE change for the first pointer found.
* </ul>
*
* <p>
* Further split detection is required to detect differences that only
* affect the CBOR structure. If the parser structure depth is different at
* the end of the span than it was at start, the last token is treated in
* the same manner as a trailing edge split. Otherwise comparing "b" with
* <code>["a", "b", "c"]</code> incorrectly generates
* <code>INSERT /0,INSERT /2</code> rather than
* <code>INSERT /0,REMOVE /1,INSERT /1,INSERT /2</code>.
*
* <h4>delete edits</h4>
*
* <p>
* The processing of splits for delete edits is the inverse of that for
* insert edits. (Swap INSERT and REMOVE, old stream and new stream).
*
* <h4>match edits</h4>
*
* <p>
* When processing match edits we move both parsers. Either or both can
* have a trailing edge split. If both are split, the corresponding pointers
* are used for the REMOVE/INSERT pair. If neither are split, or if only one
* is split, we do nothing. As far as I can tell, there can only be a single
* split if the match ends with a start structure token (<code>{</code> or
* <code>[</code>), so differences in the structure content will be detected
* by later edits.
*
* <p>
* I haven't fully convinced myself that it is correct to do nothing in all
* single split cases, but it appears to work. I've experimented with a
* variant where a single split matches the next pointer eagerly consumed
* from the opposite stream, similar to the approach taken for
* insert/delete. This produces less satisfactory matching between
 * structures. E.g. for <code>[]</code> with <code>["a", "b"]</code> it produces
 * <code>REMOVE /0,INSERT /0/0,INSERT /0/1</code> where the current
 * implementation produces <code>INSERT /0/0,INSERT /0/1</code>. It may be
* worth revisiting this in conjunction with the post-processing fix to
* collapse complete spans into the parent.
*
* <p>
* Consideration of parser depth is unnecessary for match trailing edge
* splits. A difference in structure depth must correspond to a binary
* difference covering structure delimiters, which will be handled by insert
* or delete processing.
*
* <p>
* There is a further complication to deal with for matches. A token in one
* stream can match the end and start of two tokens in the other stream. We
* deal with that by checking for a leading edge split. If the new stream
* parser is at the expected offset, and the old stream parser isn't, we've
* found a token spanning across two matches in the old stream. The previous
* insert edit will have inserted the first pointer for the new stream. We
* add the next found in the new stream in the match span (if any). Similar
* processing is performed if the old stream parser is at the expected
* offset, but the new stream parser is not.
*
* <h3>Heuristic clean up</h3>
*
* <p>
* The basic approach of using the binary delta to identify splits and
* matching the splits to the appropriate tokens produces a reasonable but
* imperfect result. Further processing is performed to improve the output.
*
* <p>
* False positives (redundant REMOVE/INSERT pairs) occur reasonably
* frequently in the raw results. These are identified and removed by
* comparing each potential REMOVE with the previous INSERT, and each
* potential INSERT with the previous REMOVE. If the associated values are
* equal, and the pointers are compatible, both changes are dropped.
*
* <p>
* The raw results might contain pointers to the entire contents of a
* structure. These are replaced with a single pointer to the structure.
*/
function DeltaVisitor(oldValue, newValue, inserted, removed, partOf, copyPartOf) {
    // Structure spans recorded by splitStructureEnd callbacks; consumed
    // by the heuristic post-processing passes below.
    var removedSplitStructures = new JSONPointerMap();
    var insertedSplitStructures = new JSONPointerMap();
    var oldParser = new SpanParser(oldValue);
    var newParser = new SpanParser(newValue);
    /**
     * The end of the last match edit. After each edit is processed,
     * oldParser.nextByte() >= oldOffset.
     */
    var oldOffset = 0;
    /**
     * The newParser position where the next match or insert will start,
     * which is the cumulative length of the match and insert edits. May be
     * less than or greater than oldOffset. After each edit is processed,
     * newParser.nextByte() >= newOffset.
     */
    var newOffset = 0;
    // Changes buffered so a later, equal change from the opposite stream
    // can cancel a REMOVE/INSERT false positive before either change is
    // committed to the result maps (see the inserter/remover consumers).
    var pendingRemove = null;
    var pendingRemoveValue;
    var pendingInsert = null;
    var pendingInsertValue;

    /**
     * Binary delta callback: bytes [start, start + length) of the old
     * value reappear at the current position of the new value. The gap
     * since the end of the previous match is a delete edit.
     */
    this.match = function(start, length) {
        handleDelete(oldOffset, start - oldOffset);
        handleMatch(start, length);
        return true;
    };

    /**
     * Process a delete edit covering [start, start + length) of the old
     * value, adding a REMOVE change for each old value covered.
     */
    function handleDelete(start, length) {
        checkInvariants();
        var end = start + length;
        if (oldParser.nextByte() < end &&
            (oldParser.spanTo(end, remover) !== 0 || oldParser.nextByte() > end)) {
            // The end is split. If newParser is not split, insert the
            // next pointer found. This will process at most one pointer
            // because newParser.nextByte() >= newOffset, so we can use
            // the stateless insert consumer.
            newParser.spanToNext(newOffset + 1, inserter);
        }
    }

    /**
     * Process a match edit: advance both parsers over the matched bytes,
     * handling leading edge and trailing edge splits (see the algorithm
     * description above).
     */
    function handleMatch(start, length) {
        checkInvariants();
        var newStart = newOffset;
        var end = start + length;
        newOffset += length;
        oldOffset = end;
        var oldNextByte = oldParser.nextByte();
        var newNextByte = newParser.nextByte();
        if (newNextByte > newStart && oldNextByte === start) {
            // New stream split affects two tokens in old stream. Remove
            // the second. This will remove exactly one pointer.
            oldParser.spanToNext(start + 1, remover);
        } else if (oldNextByte > start && newNextByte === newStart) {
            // Old stream split affects two tokens in new stream. Insert
            // the second. This will insert exactly one pointer.
            newParser.spanToNext(newStart + 1, inserter);
        }
        var lastOld = new LastResult(removedSplitStructures);
        var lastNew = new LastResult(insertedSplitStructures);
        oldParser.spanTo(end, lastOld);
        newParser.spanTo(newOffset, lastNew);
        var oldSplit = lastOld.foundLast() && oldParser.nextByte() > end;
        var newSplit = lastNew.foundLast() && newParser.nextByte() > newOffset;
        if (oldSplit && newSplit) {
            // Both streams have trailing edge splits: pair them up as a
            // REMOVE/INSERT. If only one stream is split, do nothing.
            lastOld.consumeLast(remover);
            lastNew.consumeLast(inserter);
        }
    }

    /**
     * Binary delta callback: `bytes` are inserted at the current position
     * of the new value. Adds an INSERT change for each new value covered.
     */
    this.insert = function(bytes) {
        checkInvariants();
        newOffset += bytes.length;
        if (newParser.nextByte() < newOffset &&
            (newParser.spanTo(newOffset, inserter) !== 0 || newParser.nextByte() > newOffset)) {
            // The end is split. If oldParser is not split, remove the
            // next pointer found. This will process at most one pointer
            // because oldParser.nextByte() >= oldOffset, so we
            // can use the stateless remove consumer.
            oldParser.spanToNext(oldOffset + 1, remover);
        }
        return true;
    };

    /**
     * Binary delta callback: no more edits. Flush the trailing delete and
     * any pending changes, then run the heuristic clean up passes.
     */
    this.end = function() {
        handleDelete(oldOffset, oldValue.$length - oldOffset);
        addInsert(null, null);
        addRemove(null, null);
        replaceFullRemovedStructures();
        replaceFullInsertedStructures();
    };

    /**
     * Commit the pending INSERT change (if any) to the result map and
     * buffer the next one. Passing (null, null) simply flushes.
     */
    function addInsert(nextPointer, nextValue) {
        if (pendingInsert !== null) {
            inserted.put(pendingInsert, pendingInsertValue);
        }
        pendingInsert = nextPointer;
        pendingInsertValue = nextValue;
    }

    /**
     * Commit the pending REMOVE change (if any) to the result map and
     * buffer the next one. Passing (null, null) simply flushes.
     */
    function addRemove(nextPointer, nextValue) {
        if (pendingRemove !== null) {
            removed.put(pendingRemove, pendingRemoveValue);
        }
        pendingRemove = nextPointer;
        pendingRemoveValue = nextValue;
    }

    // Binary delta callback: identical values, nothing to record.
    this.noChange = function() {
    };

    /**
     * Sanity check: neither parser may lag behind its offset. A violation
     * indicates a malformed binary delta.
     */
    function checkInvariants() {
        if (oldParser.nextByte() < oldOffset ||
            newParser.nextByte() < newOffset) {
            throw new Error("Invalid binary delta");
        }
    }

    /**
     * Heuristic post-processing: replace split structures for which removed
     * includes every entry with a single pointer.
     */
    function replaceFullRemovedStructures() {
        var i = removedSplitStructures.postOrder();
        while (i.hasNext()) {
            var s = i.next();
            var split = s.value;
            var entry = removed.getEntry(s.pointer);
            if (entry !== null && entry.numberOfChildren() === split.elements) {
                entry.setValue(copyPartOf(oldValue, split.start, split.length));
                entry.removeDescendants();
            }
        }
    }

    /**
     * Heuristic post-processing: replace split structures for which
     * inserted includes every entry with a single pointer.
     */
    function replaceFullInsertedStructures() {
        var i = insertedSplitStructures.postOrder();
        while (i.hasNext()) {
            var s = i.next();
            var split = s.value;
            var entry = inserted.getEntry(s.pointer);
            if (entry !== null && entry.numberOfChildren() === split.elements) {
                entry.setValue(partOf(newValue, split.start, split.length));
                entry.removeDescendants();
            }
        }
    }

    /**
     * Span consumer recording INSERT changes from the new stream. Cancels
     * the pending REMOVE when it is an equal-value false positive.
     */
    var inserter = {
        accept : function(pointer, start, length) {
            if (pendingRemove !== null &&
                pendingRemove.equalIgnoringIndexes(pointer) &&
                isPartOf(pendingRemoveValue, newValue, start, length)) {
                // Value to insert is equal to the pending removed value
                // and their pointers are sufficiently similar. Treat as a
                // false positive and discard both.
                pendingRemove = null;
                pendingRemoveValue = null;
            } else {
                // Share the underlying bytes to avoid copying at the
                // expense of pinning a potentially large array in memory.
                // We expect the user to be using the structural delta to
                // decide whether to take action, then throwing everything
                // away if not. If it turns out that users often keep the
                // structural delta but not the value it might be worth
                // providing API control.
                addInsert(pointer, partOf(newValue, start, length));
                // Flush pending remove. Subsequent inserts are less likely
                // to match it.
                addRemove(null, null);
            }
        },
        splitStructureEnd : function(pointer, count, start, length) {
            insertedSplitStructures.put(pointer, {
                start : start,
                length : length,
                elements : count
            });
        }
    };

    /**
     * Span consumer recording REMOVE changes from the old stream. Cancels
     * the pending INSERT when it is an equal-value false positive.
     */
    var remover = {
        accept : function(pointer, start, length) {
            if (pendingInsert !== null &&
                pendingInsert.equalIgnoringIndexes(pointer) &&
                isPartOf(pendingInsertValue, oldValue, start, length)) {
                // Value to remove is equal to the pending inserted value
                // and their pointers are sufficiently similar. Treat as a
                // false positive and discard both.
                pendingInsert = null;
                pendingInsertValue = null;
            } else {
                // In contrast to the insert consumer, we copy byte ranges
                // to avoid pinning the entire old byte array in memory.
                addRemove(pointer, copyPartOf(oldValue, start, length));
                // Flush pending insert. Subsequent removes are less likely
                // to match it.
                addInsert(null, null);
            }
        },
        splitStructureEnd : function(pointer, count, start, length) {
            removedSplitStructures.put(pointer, {
                start : start,
                length : length,
                elements : count
            });
        }
    };
}
/**
 * Span consumer that remembers only the most recently reported span so it
 * can later be replayed into another consumer, while forwarding structure
 * end callbacks straight into `splitStructures`.
 */
function LastResult(splitStructures) {
    var lastPointer;
    var lastStart;
    var lastLength;

    // Record the span, overwriting any earlier one.
    this.accept = function (pointer, start, length) {
        lastPointer = pointer;
        lastStart = start;
        lastLength = length;
    };

    // Whether accept() has been called with a (truthy) pointer.
    this.foundLast = function () {
        return !!lastPointer;
    };

    // Replay the remembered span into `delegate`.
    this.consumeLast = function (delegate) {
        delegate.accept(lastPointer, lastStart, lastLength);
    };

    // Structure ends are not buffered; record them immediately.
    this.splitStructureEnd = function (pointer, count, start, length) {
        splitStructures.put(pointer, {
            start: start,
            length: length,
            elements: count
        });
    };
}
/**
 * Read-only map view over a JSONPointerMap of changes, keyed by the
 * string form of each pointer.
 */
function ChangeMapImpl(parts) {
    // Materialise the entries up front so entrySet() is cheap and stable.
    var entries = [];
    for (var it = parts.iterator(); it.hasNext();) {
        var node = it.next();
        entries.push({
            key : node.pointer.toString(),
            value : node.value.get()
        });
    }

    this.length = entries.length;

    this.entrySet = function() {
        return entries;
    };

    this.containsKey = function(key) {
        var pointer = JSONPointer.parse(key);
        return parts.contains(pointer);
    };

    this.get = function(key) {
        var pointer = JSONPointer.parse(key);
        return parts.get(pointer).get();
    };

    this.descendants = function(pointer) {
        var parsed = JSONPointer.parse(pointer);
        return new ChangeMapImpl(parts.descendants(parsed));
    };

    this.intersection = function(pointer) {
        var parsed = JSONPointer.parse(pointer);
        return new ChangeMapImpl(parts.intersection(parsed));
    };
}