// Diffusion JavaScript client — JSON structural delta implementation.
/*eslint valid-jsdoc: "off"*/
var JSONPointerMap = require('data/json/json-pointer-map');
var JSONPointer = require('data/json/json-pointer');
var SpanParser = require('data/json/span-parser');
// Report whether the bytes of `original` equal the byte range of `other`
// beginning `start` bytes into its window, for the given length.
function isPartOf(original, other, start, length) {
    var absoluteOffset = other.$offset + start;
    return original.equalBytes(other.$buffer, absoluteOffset, length);
}
module.exports = function JSONDeltaImpl(factory, original, newValue, binaryDelta) {
var inserted = new JSONPointerMap();
var removed = new JSONPointerMap();
function partOf(value, start, length) {
return new factory(value.$buffer, value.$offset + start, length);
}
function copyPartOf(value, start, length) {
var offsetStart = value.$offset + start;
var buffer = new Buffer(length);
value.$buffer.copy(buffer, 0, offsetStart, offsetStart + length);
return new factory(buffer, 0, length);
}
if (binaryDelta !== undefined) {
binaryDelta.visit(new DeltaVisitor(original, newValue, inserted, removed, partOf, copyPartOf));
} else {
inserted.put(JSONPointer.ROOT, original);
removed.put(JSONPointer.ROOT, newValue);
}
this.removed = function() {
return new ChangeMapImpl(removed);
};
this.inserted = function() {
return new ChangeMapImpl(inserted);
};
this.hasChanges = function() {
return removed.size !== 0 || inserted.size !== 0;
};
this.toString = function() {
return ['REMOVE ', removed, ' INSERT ', inserted].join('');
};
};
/**
* Incrementally parse the binary delta to calculate a structural delta.
*
* <p>
* The result reports all differences, but is not necessarily minimal. In
* particular there are occasional "false positive" REMOVE/INSERT pairs.
*
* <h2>The Algorithm</h2>
*
* <p>
* We re-constitute the match and insert visitor callbacks into a sequence
* of delete, insert, and match edits. The binary delta format guarantees
* there will not be consecutive callbacks of the same type, and that match
* callbacks are presented in order with no overlaps. Delete and match
* edits will be contiguous with no gap between the end of one edit and
* the start of the next.
* <p>
* The processing uses two SpanParsers, one for the old token stream and one
* for the new token streams. We move forward along each parser driven by
* the binary delta edits, skipping old values covered by a match, adding a
* REMOVE change for each old value covered by a delete, and adding an
* INSERT change for each new value covered by an insert.
*
* <p>
* The SpanParsers calculate the appropriate JSON pointers to return for a
* span. A span finishes at the end of the first token found after the
* target offset, with these exceptions:
* <ol>
*
* <li>If a split is found in a field name, the span continues until the
* first pointer is found in the value, potentially consuming one or more
* structure start tokens. The reporting of a change that affects a field
* does not distinguish between the field name and its value, except if the
* value is a structure and the change only affects part of the structure.
*
* <li>End structure tokens are eagerly consumed. This has two benefits.
* First, it collapses empty structures to the appropriate parent pointer.
* Second, the closing tokens of non-empty structures are associated with
* the last pointer, which simplifies boundary detection.
*
* </ol>
*
* <p>
* The implementation relies on the SpanParser not consuming start structure
* tokens eagerly at a split unless instructed to do so (see
* {@link SpanParser#spanToNext()}). This provides a tighter alignment
* between the detected binary differences and the token parsers, allowing a
* simpler pairing of tokens between the two streams. I'm not certain there
* aren't additional edge cases introduced by the eager consumption of
* structure tokens for field names.
*
* <p>
* The two token streams are treated symmetrically. Each SpanParser is only
* moved in a forward direction; there is no backtracking. We maintain the
* current byte position in each binary value as {@link #oldOffset} and
* {@link #newOffset}. After each edit is processed,
* <code>oldParser.nextByte() >= oldOffset</code> and
* <code>newParser.nextByte() >= newOffset</code>.
*
* <p>
* The fun happens at the edges, specifically where a token covers one or
* more binary edits. We call such a misaligned edge a "split".
*
* <h3>Splits</h3>
*
* <p>
* The appropriate processing of a split depends on the type of edit in
* which it is found. For all edit types, we detect and process
* <em>trailing edge</em> splits; i.e. splits detected at the end of a span
* because the end of the last parsed token is after the end edge. It turns
* out we must also consider <em>leading edge</em> splits for match edits.
*
* <h4>insert edits</h4>
*
* <p>
* We insert everything in the span. If a trailing edge split is detected,
* we do one of two things.
* <ul>
* <li>If the old stream parser is at a split, there is nothing to do.
* The split will have been considered and processed for an earlier edit.
*
* <li>If the old stream parser is at <code>oldOffset</code> (i.e. is not at a
* split token), we consume and add a REMOVE change for the next pointer in
* the old stream. The old stream is otherwise unaffected by the insert
* edit, so <code>oldOffset</code> is the start of the next delete or match
* edit. Consuming the token might move it further into other delete or
* match edits. In all cases, correct handling of the insert split requires
* adding a REMOVE change for the first pointer found.
* </ul>
*
* <p>
* Further split detection is required to detect differences that only
* affect the CBOR structure. If the parser structure depth is different at
* the end of the span than it was at start, the last token is treated in
* the same manner as a trailing edge split. Otherwise comparing "b" with
* <code>["a", "b", "c"]</code> incorrectly generates
* <code>INSERT /0,INSERT /2</code> rather than
* <code>INSERT /0,REMOVE /1,INSERT /1,INSERT /2</code>.
*
* <h4>delete edits</h4>
*
* <p>
* The processing of splits for delete edits is the inverse of that for
* insert edits. (Swap INSERT and REMOVE, old stream and new stream).
*
* <h4>match edits</h4>
*
* <p>
* When processing match edits we move both parsers. Either or both can
* have a trailing edge split. If both are split, the corresponding pointers
* are used for the REMOVE/INSERT pair. If neither are split, or if only one
* is split, we do nothing. As far as I can tell, there can only be a single
* split if the match ends with a start structure token (<code>{</code> or
* <code>[</code>), so differences in the structure content will be detected
* by later edits.
*
* <p>
* I haven't fully convinced myself that it is correct to do nothing in all
* single split cases, but it appears to work. I've experimented with a
* variant where a single split matches the next pointer eagerly consumed
* from the opposite stream, similar to the approach taken for
* insert/delete. This produces less satisfactory matching between
 * structures. E.g. for <code>[]</code> with <code>["a", "b"]</code> it produces
 * <code>REMOVE /0,INSERT /0/0,INSERT /0/1</code> where the current
 * implementation produces <code>INSERT /0/0,INSERT /0/1</code>. It may be
* worth revisiting this in conjunction with the post-processing fix to
* collapse complete spans into the parent.
*
* <p>
* Consideration of parser depth is unnecessary for match trailing edge
* splits. A difference in structure depth must correspond to a binary
* difference covering structure delimiters, which will be handled by insert
* or delete processing.
*
* <p>
* There is a further complication to deal with for matches. A token in one
* stream can match the end and start of two tokens in the other stream. We
* deal with that by checking for a leading edge split. If the new stream
* parser is at the expected offset, and the old stream parser isn't, we've
* found a token spanning across two matches in the old stream. The previous
* insert edit will have inserted the first pointer for the new stream. We
* add the next found in the new stream in the match span (if any). Similar
* processing is performed if the old stream parser is at the expected
* offset, but the new stream parser is not.
*
* <h3>Heuristic clean up</h3>
*
* <p>
* The basic approach of using the binary delta to identify splits and
* matching the splits to the appropriate tokens produces a reasonable but
* imperfect result. Further processing is performed to improve the output.
*
* <p>
* False positives (redundant REMOVE/INSERT pairs) occur reasonably
* frequently in the raw results. These are identified and removed by
* comparing each potential REMOVE with the previous INSERT, and each
* potential INSERT with the previous REMOVE. If the associated values are
* equal, and the pointers are compatible, both changes are dropped.
*
* <p>
* The raw results might contain pointers to the entire contents of a
* structure. These are replaced with a single pointer to the structure.
*/
function DeltaVisitor(oldValue, newValue, inserted, removed, partOf, copyPartOf) {
    // Structure spans recorded by splitStructureEnd callbacks; consumed
    // by the heuristic post-processing passes below.
    var removedSplitStructures = new JSONPointerMap();
    var insertedSplitStructures = new JSONPointerMap();
    var oldParser = new SpanParser(oldValue);
    var newParser = new SpanParser(newValue);
    /**
     * The end of the last match edit. After each edit is processed,
     * oldParser.nextByte() >= oldOffset.
     */
    var oldOffset = 0;
    /**
     * The newParser position where the next match or insert will start,
     * which is the cumulative length of the match and insert edits. May be
     * less than or greater than oldOffset. After each edit is processed,
     * newParser.nextByte() >= newOffset.
     */
    var newOffset = 0;
    // Changes buffered so a later, equal change from the opposite stream
    // can cancel a REMOVE/INSERT false positive before either change is
    // committed to the result maps (see the inserter/remover consumers).
    var pendingRemove = null;
    var pendingRemoveValue;
    var pendingInsert = null;
    var pendingInsertValue;

    /**
     * Binary delta callback: bytes [start, start + length) of the old
     * value reappear at the current position of the new value. The gap
     * since the end of the previous match is a delete edit.
     */
    this.match = function(start, length) {
        handleDelete(oldOffset, start - oldOffset);
        handleMatch(start, length);
        return true;
    };

    /**
     * Process a delete edit covering [start, start + length) of the old
     * value, adding a REMOVE change for each old value covered.
     */
    function handleDelete(start, length) {
        checkInvariants();
        var end = start + length;
        if (oldParser.nextByte() < end &&
            (oldParser.spanTo(end, remover) !== 0 || oldParser.nextByte() > end)) {
            // The end is split. If newParser is not split, insert the
            // next pointer found. This will process at most one pointer
            // because newParser.nextByte() >= newOffset, so we can use
            // the stateless insert consumer.
            newParser.spanToNext(newOffset + 1, inserter);
        }
    }

    /**
     * Process a match edit: advance both parsers over the matched bytes,
     * handling leading edge and trailing edge splits (see the algorithm
     * description above).
     */
    function handleMatch(start, length) {
        checkInvariants();
        var newStart = newOffset;
        var end = start + length;
        newOffset += length;
        oldOffset = end;
        var oldNextByte = oldParser.nextByte();
        var newNextByte = newParser.nextByte();
        if (newNextByte > newStart && oldNextByte === start) {
            // New stream split affects two tokens in old stream. Remove
            // the second. This will remove exactly one pointer.
            oldParser.spanToNext(start + 1, remover);
        } else if (oldNextByte > start && newNextByte === newStart) {
            // Old stream split affects two tokens in new stream. Insert
            // the second. This will insert exactly one pointer.
            newParser.spanToNext(newStart + 1, inserter);
        }
        var lastOld = new LastResult(removedSplitStructures);
        var lastNew = new LastResult(insertedSplitStructures);
        oldParser.spanTo(end, lastOld);
        newParser.spanTo(newOffset, lastNew);
        var oldSplit = lastOld.foundLast() && oldParser.nextByte() > end;
        var newSplit = lastNew.foundLast() && newParser.nextByte() > newOffset;
        if (oldSplit && newSplit) {
            // Both streams have trailing edge splits: pair them up as a
            // REMOVE/INSERT. If only one stream is split, do nothing.
            lastOld.consumeLast(remover);
            lastNew.consumeLast(inserter);
        }
    }

    /**
     * Binary delta callback: `bytes` are inserted at the current position
     * of the new value. Adds an INSERT change for each new value covered.
     */
    this.insert = function(bytes) {
        checkInvariants();
        newOffset += bytes.length;
        if (newParser.nextByte() < newOffset &&
            (newParser.spanTo(newOffset, inserter) !== 0 || newParser.nextByte() > newOffset)) {
            // The end is split. If oldParser is not split, remove the
            // next pointer found. This will process at most one pointer
            // because oldParser.nextByte() >= oldOffset, so we
            // can use the stateless remove consumer.
            oldParser.spanToNext(oldOffset + 1, remover);
        }
        return true;
    };

    /**
     * Binary delta callback: no more edits. Flush the trailing delete and
     * any pending changes, then run the heuristic clean up passes.
     */
    this.end = function() {
        handleDelete(oldOffset, oldValue.$length - oldOffset);
        addInsert(null, null);
        addRemove(null, null);
        replaceFullRemovedStructures();
        replaceFullInsertedStructures();
    };

    /**
     * Commit the pending INSERT change (if any) to the result map and
     * buffer the next one. Passing (null, null) simply flushes.
     */
    function addInsert(nextPointer, nextValue) {
        if (pendingInsert !== null) {
            inserted.put(pendingInsert, pendingInsertValue);
        }
        pendingInsert = nextPointer;
        pendingInsertValue = nextValue;
    }

    /**
     * Commit the pending REMOVE change (if any) to the result map and
     * buffer the next one. Passing (null, null) simply flushes.
     */
    function addRemove(nextPointer, nextValue) {
        if (pendingRemove !== null) {
            removed.put(pendingRemove, pendingRemoveValue);
        }
        pendingRemove = nextPointer;
        pendingRemoveValue = nextValue;
    }

    // Binary delta callback: identical values, nothing to record.
    this.noChange = function() {
    };

    /**
     * Sanity check: neither parser may lag behind its offset. A violation
     * indicates a malformed binary delta.
     */
    function checkInvariants() {
        if (oldParser.nextByte() < oldOffset ||
            newParser.nextByte() < newOffset) {
            throw new Error("Invalid binary delta");
        }
    }

    /**
     * Heuristic post-processing: replace split structures for which removed
     * includes every entry with a single pointer.
     */
    function replaceFullRemovedStructures() {
        var i = removedSplitStructures.postOrder();
        while (i.hasNext()) {
            var s = i.next();
            var split = s.value;
            var entry = removed.getEntry(s.pointer);
            if (entry !== null && entry.numberOfChildren() === split.elements) {
                entry.setValue(copyPartOf(oldValue, split.start, split.length));
                entry.removeDescendants();
            }
        }
    }

    /**
     * Heuristic post-processing: replace split structures for which
     * inserted includes every entry with a single pointer.
     */
    function replaceFullInsertedStructures() {
        var i = insertedSplitStructures.postOrder();
        while (i.hasNext()) {
            var s = i.next();
            var split = s.value;
            var entry = inserted.getEntry(s.pointer);
            if (entry !== null && entry.numberOfChildren() === split.elements) {
                entry.setValue(partOf(newValue, split.start, split.length));
                entry.removeDescendants();
            }
        }
    }

    /**
     * Span consumer recording INSERT changes from the new stream. Cancels
     * the pending REMOVE when it is an equal-value false positive.
     */
    var inserter = {
        accept : function(pointer, start, length) {
            if (pendingRemove !== null &&
                pendingRemove.equalIgnoringIndexes(pointer) &&
                isPartOf(pendingRemoveValue, newValue, start, length)) {
                // Value to insert is equal to the pending removed value
                // and their pointers are sufficiently similar. Treat as a
                // false positive and discard both.
                pendingRemove = null;
                pendingRemoveValue = null;
            } else {
                // Share the underlying bytes to avoid copying at the
                // expense of pinning a potentially large array in memory.
                // We expect the user to be using the structural delta to
                // decide whether to take action, then throwing everything
                // away if not. If it turns out that users often keep the
                // structural delta but not the value it might be worth
                // providing API control.
                addInsert(pointer, partOf(newValue, start, length));
                // Flush pending remove. Subsequent inserts are less likely
                // to match it.
                addRemove(null, null);
            }
        },
        splitStructureEnd : function(pointer, count, start, length) {
            insertedSplitStructures.put(pointer, {
                start : start,
                length : length,
                elements : count
            });
        }
    };

    /**
     * Span consumer recording REMOVE changes from the old stream. Cancels
     * the pending INSERT when it is an equal-value false positive.
     */
    var remover = {
        accept : function(pointer, start, length) {
            if (pendingInsert !== null &&
                pendingInsert.equalIgnoringIndexes(pointer) &&
                isPartOf(pendingInsertValue, oldValue, start, length)) {
                // Value to remove is equal to the pending inserted value
                // and their pointers are sufficiently similar. Treat as a
                // false positive and discard both.
                pendingInsert = null;
                pendingInsertValue = null;
            } else {
                // In contrast to the insert consumer, we copy byte ranges
                // to avoid pinning the entire old byte array in memory.
                addRemove(pointer, copyPartOf(oldValue, start, length));
                // Flush pending insert. Subsequent removes are less likely
                // to match it.
                addInsert(null, null);
            }
        },
        splitStructureEnd : function(pointer, count, start, length) {
            removedSplitStructures.put(pointer, {
                start : start,
                length : length,
                elements : count
            });
        }
    };
}
/**
 * Span consumer that remembers only the most recently reported span so it
 * can later be replayed into another consumer, while forwarding structure
 * end callbacks straight into `splitStructures`.
 */
function LastResult(splitStructures) {
    var lastPointer;
    var lastStart;
    var lastLength;

    // Record the span, overwriting any earlier one.
    this.accept = function (pointer, start, length) {
        lastPointer = pointer;
        lastStart = start;
        lastLength = length;
    };

    // Whether accept() has been called with a (truthy) pointer.
    this.foundLast = function () {
        return !!lastPointer;
    };

    // Replay the remembered span into `delegate`.
    this.consumeLast = function (delegate) {
        delegate.accept(lastPointer, lastStart, lastLength);
    };

    // Structure ends are not buffered; record them immediately.
    this.splitStructureEnd = function (pointer, count, start, length) {
        splitStructures.put(pointer, {
            start: start,
            length: length,
            elements: count
        });
    };
}
/**
 * Read-only map view over a JSONPointerMap of changes, keyed by the
 * string form of each pointer.
 */
function ChangeMapImpl(parts) {
    // Materialise the entries up front so entrySet() is cheap and stable.
    var entries = [];
    for (var it = parts.iterator(); it.hasNext();) {
        var node = it.next();
        entries.push({
            key : node.pointer.toString(),
            value : node.value.get()
        });
    }

    this.length = entries.length;

    this.entrySet = function() {
        return entries;
    };

    this.containsKey = function(key) {
        var pointer = JSONPointer.parse(key);
        return parts.contains(pointer);
    };

    this.get = function(key) {
        var pointer = JSONPointer.parse(key);
        return parts.get(pointer).get();
    };

    this.descendants = function(pointer) {
        var parsed = JSONPointer.parse(pointer);
        return new ChangeMapImpl(parts.descendants(parsed));
    };

    this.intersection = function(pointer) {
        var parsed = JSONPointer.parse(pointer);
        return new ChangeMapImpl(parts.intersection(parsed));
    };
}