// compromise
// Version:
// natural language processing in the browser
// 55 lines (51 loc) • 1.55 kB
// JavaScript
;
/**
 * Turn XML/HTML special characters into ampersand-encoded entities.
 *
 * The original author noted they were not sure this is perfectly safe, so
 * treat it as best-effort escaping rather than a hardened security boundary.
 *
 * @param {string} s - raw text to encode
 * @returns {string} `s` with `<`, `>`, `&`, `"`, `'` and spaces replaced by entities
 */
const escapeHtml = (s) => {
  // Every value must be the *encoded* form; mapping a character to itself
  // would leave the text unescaped.
  const HTML_CHAR_MAP = {
    '<': '&lt;',
    '>': '&gt;',
    '&': '&amp;',
    '"': '&quot;',
    '\'': '&#39;', // numeric reference: `&apos;` is not defined in HTML4
    ' ': '&nbsp;'
  };
  // Single pass over the string, so an inserted `&` is never re-encoded.
  return s.replace(/[<>&"' ]/g, (ch) => HTML_CHAR_MAP[ch]);
};
/**
 * Remove HTML elements already present in the text.
 *
 * Marked "not tested!" by the original author. Approach taken from
 * http://stackoverflow.com/questions/295566/sanitize-rewrite-html-on-the-client-side
 *
 * @param {string} html - text that may contain markup
 * @returns {string} text with comments and tags removed (including the
 *   contents of script/style elements) and any leftover `<` encoded as `&lt;`
 */
const sanitize = (html) => {
  // Attribute section of a tag: unquoted characters or complete quoted strings,
  // so a `>` inside a quoted attribute value does not end the tag early.
  const tagBody = '(?:[^"\'>]|"[^"]*"|\'[^\']*\')*';
  const tagOrComment = new RegExp(
    '<(?:'
    // Comment body.
    + '!--(?:(?:-*[^->])*--+|-?)'
    // Special "raw text" elements whose content should be elided.
    + '|script\\b' + tagBody + '>[\\s\\S]*?</script\\s*'
    + '|style\\b' + tagBody + '>[\\s\\S]*?</style\\s*'
    // Regular name
    + '|/?[a-z]'
    + tagBody
    + ')>',
    'gi');
  // Removing one tag can splice the surrounding text into a new tag
  // (e.g. "<<b>b>"), so repeat until a pass changes nothing.
  let oldHtml;
  do {
    oldHtml = html;
    html = html.replace(tagOrComment, '');
  } while (html !== oldHtml);
  // Any `<` still present did not form a complete tag; encode it so it
  // cannot open one later.
  return html.replace(/</g, '&lt;');
};
/**
 * Turn a term into ~properly~ formatted html: a <span> whose classes are the
 * term's tags, surrounded by the term's own whitespace.
 *
 * @param {object} t - term; this function reads `t.tag` (its keys become
 *   class names), `t.text`, `t.whitespace.before` and `t.whitespace.after`
 * @returns {string} html string for the term
 */
const renderHtml = function(t) {
  // every tag except the generic 'Term' becomes an "nlp"-prefixed class
  const classAttr = Object.keys(t.tag)
    .filter((name) => name !== 'Term')
    .map((name) => 'nlp' + name)
    .join(' ');
  // strip existing markup first, then entity-encode what is left
  // NOTE(review): any entity sanitize() emits would have its '&' re-encoded
  // by escapeHtml() here — confirm that is intended.
  const body = escapeHtml(sanitize(t.text));
  const leading = escapeHtml(t.whitespace.before);
  const trailing = escapeHtml(t.whitespace.after);
  return `${leading}<span class="${classAttr}">${body}</span>${trailing}`;
};
// CommonJS export: renderHtml is the only value this module exposes.
module.exports = renderHtml;