/*
 * @hugsmidjan/htmldiff-js
 *
 * JavaScript port of HtmlDiff.Net, which is itself a C# port of HtmlDiff.
 * Modified for reglugerd.is.
 */
import Mode from './Mode';
import * as Utils from './Utils';
/**
 * Splits an HTML string into the flat list of "words" (tags, entities,
 * whitespace runs, punctuation and text words) used by the diff.
 *
 * The text is scanned one character at a time by a small state machine whose
 * mode (character / tag / whitespace / entity) decides whether the character
 * extends the word being built or closes it and starts a new one.
 *
 * @param {string} text - HTML source to tokenise.
 * @param {Iterable<RegExp>|null} blockExpressions - Forwarded to `findBlocks`;
 *   characters inside a matched span are appended to the current word without
 *   being classified.
 * @returns {string[]} The words in source order.
 * @throws {Error} Whatever `findBlocks` throws (e.g. two block matches that
 *   start at the same index).
 */
function convertHtmlToListOfWords(text, blockExpressions) {
  const state = {
    mode: Mode.character,
    currentWord: [],
    words: [],
  };
  const blockLocations = findBlocks(text, blockExpressions);
  const isBlockCheckRequired = blockLocations.size > 0;
  let isGrouping = false;
  let groupingUntil = -1;

  for (let i = 0; i < text.length; i++) {
    const character = text[i];

    // Don't bother executing block checks if we don't have any blocks to check for!
    if (isBlockCheckRequired) {
      // Check if we have completed grouping a text sequence/block.
      // (Compared against the loop index `i`; the end offset is exclusive.)
      if (groupingUntil === i) {
        groupingUntil = -1;
        isGrouping = false;
      }

      // Check if we need to group the next text sequence/block.
      if (blockLocations.has(i)) {
        isGrouping = true;
        groupingUntil = blockLocations.get(i);
      }

      // While grouping, the kind of character is irrelevant: it is simply
      // appended to the word currently being built.
      if (isGrouping) {
        state.currentWord.push(character);
        state.mode = Mode.character;
        continue;
      }
    }

    switch (state.mode) {
      case Mode.character:
        if (Utils.isStartOfTag(character)) {
          addClearWordSwitchMode(state, '<', Mode.tag);
        } else if (Utils.isStartOfEntity(character)) {
          addClearWordSwitchMode(state, character, Mode.entity);
        } else if (Utils.isWhiteSpace(character)) {
          addClearWordSwitchMode(state, character, Mode.whitespace);
        } else if (
          /[,.]/.test(character) &&
          (Utils.isWhiteSpace(text[i + 1]) || text[i + 1] === undefined)
        ) {
          // Special case: a `.` or `,` directly before whitespace or the end
          // of the text becomes its own word instead of sticking to the
          // preceding word.
          addClearWordSwitchMode(state, character, Mode.character);
        } else if (
          Utils.isWord(character) &&
          (state.currentWord.length === 0 ||
            Utils.isWord(state.currentWord[state.currentWord.length - 1]))
        ) {
          // Word character following word characters: keep building the word.
          state.currentWord.push(character);
        } else {
          addClearWordSwitchMode(state, character, Mode.character);
        }
        break;

      case Mode.tag:
        if (Utils.isEndOfTag(character)) {
          // The closing character belongs to the tag; emit the whole tag.
          state.currentWord.push(character);
          state.words.push(state.currentWord.join(''));
          state.currentWord = [];
          state.mode = Utils.isWhiteSpace(character) ? Mode.whitespace : Mode.character;
        } else {
          state.currentWord.push(character);
        }
        break;

      case Mode.whitespace:
        if (Utils.isStartOfTag(character)) {
          addClearWordSwitchMode(state, character, Mode.tag);
        } else if (Utils.isStartOfEntity(character)) {
          addClearWordSwitchMode(state, character, Mode.entity);
        } else if (Utils.isWhiteSpace(character)) {
          // Consecutive whitespace is collected into a single word.
          state.currentWord.push(character);
        } else {
          addClearWordSwitchMode(state, character, Mode.character);
        }
        break;

      case Mode.entity:
        if (Utils.isStartOfTag(character)) {
          addClearWordSwitchMode(state, character, Mode.tag);
        } else if (Utils.isWhiteSpace(character)) {
          addClearWordSwitchMode(state, character, Mode.whitespace);
        } else if (Utils.isEndOfEntity(character)) {
          let switchToNextMode = true;
          if (state.currentWord.length !== 0) {
            state.currentWord.push(character);
            state.words.push(state.currentWord.join(''));

            // Join the entity with the preceding whitespace word when
            // Utils.isWhiteSpace accepts both of them.
            // NOTE(review): `> 2` means this never triggers while the
            // whitespace and the entity are the only two words so far;
            // kept as in the original.
            if (
              state.words.length > 2 &&
              Utils.isWhiteSpace(state.words[state.words.length - 2]) &&
              Utils.isWhiteSpace(state.words[state.words.length - 1])
            ) {
              const w1 = state.words[state.words.length - 2];
              const w2 = state.words[state.words.length - 1];
              state.words.splice(state.words.length - 2, 2);
              // Re-open the merged text as the current word, one character
              // per element like every other currentWord.
              state.currentWord = (w1 + w2).split('');
              state.mode = Mode.whitespace;
              switchToNextMode = false;
            }
          }
          if (switchToNextMode) {
            state.currentWord = [];
            state.mode = Mode.character;
          }
        } else if (Utils.isWord(character)) {
          state.currentWord.push(character);
        } else {
          addClearWordSwitchMode(state, character, Mode.character);
        }
        break;
    }
  }

  // Flush whatever is still being built when the text ends.
  if (state.currentWord.length !== 0) {
    state.words.push(state.currentWord.join(''));
  }
  return state.words;
}
/**
 * Closes the word currently being built and starts a new one.
 *
 * Any pending characters are joined and appended to `state.words`; the new
 * word is then seeded with `character` and the tokenizer moves to `mode`.
 *
 * @param {{mode: *, currentWord: Array, words: string[]}} state - Tokenizer
 *   state; mutated in place.
 * @param {string} character - First character of the new word.
 * @param {*} mode - Mode to switch to.
 * @returns {void}
 */
function addClearWordSwitchMode(state, character, mode) {
  const pending = state.currentWord;
  if (pending.length > 0) {
    const finishedWord = pending.join('');
    state.words.push(finishedWord);
  }
  state.mode = mode;
  state.currentWord = [character];
}
/**
 * Locates every span of `text` matched by the given block expressions.
 *
 * @param {string} text - Text to search.
 * @param {Iterable<RegExp>|null|undefined} blockExpressions - Expressions whose
 *   matches should be treated as blocks. `null` or `undefined` means "none".
 * @returns {Map<number, number>} Map from the start index of each match to its
 *   end index (exclusive, i.e. start + match length).
 * @throws {Error} If two matches start at the same index.
 */
function findBlocks(text, blockExpressions) {
  const blockLocations = new Map();
  if (blockExpressions == null) {
    return blockLocations;
  }

  for (const exp of blockExpressions) {
    // Scan with a private global copy: a non-global regex would make exec()
    // return the same match forever, and using the caller's object would both
    // depend on and clobber its lastIndex.
    const flags = exp.global ? exp.flags : exp.flags + 'g';
    const scanner = new RegExp(exp.source, flags);
    const isFullUnicode = Boolean(scanner.unicode || scanner.unicodeSets);

    let m;
    while ((m = scanner.exec(text)) !== null) {
      if (m[0].length === 0) {
        // exec() does not move past an empty match, and an empty block has no
        // meaning: skip it and step forward manually. In Unicode mode step
        // over a whole code point so we never land inside a surrogate pair.
        const step = isFullUnicode && text.codePointAt(m.index) > 0xffff ? 2 : 1;
        scanner.lastIndex = m.index + step;
        continue;
      }
      if (blockLocations.has(m.index)) {
        throw new Error(
          'One or more block expressions result in a text sequence that overlaps. Current expression: ' +
            exp.toString()
        );
      }
      blockLocations.set(m.index, m.index + m[0].length);
    }
  }
  return blockLocations;
}
export { convertHtmlToListOfWords };