// conllu-brat
// Version:
// conllu.js with brat
// 1,399 lines (1,193 loc) • 46.2 kB
// JavaScript
// -*- Mode: JavaScript; tab-width: 4; indent-tabs-mode: nil; -*-
// vim:set ft=javascript ts=4 sw=4 sts=4 cindent:
/*
CoNLL-U format library for JavaScript.
Home: http://github.com/spyysalo/conllu.js
Format: http://universaldependencies.github.io/docs/format.html
Author: Sampo Pyysalo
License: MIT (http://opensource.org/licenses/MIT)
*/
const ConllU = (function() {
/*
* ConllU.Document: represents CoNLL-U document
*/
// Constructor for a CoNLL-U document. All fields are initialised by
// reset(), so a new document starts in the same state that reset()
// restores.
var Document = function() {
    this.reset();
};
// Return the document to its initial state: no sentences, no error
// recorded, a logger that discards messages, and an undecided parsing
// mode.
Document.prototype.reset = function() {
    Object.assign(this, {
        sentences: [],
        error: false,
        logger: function(s) { /* no-op */ },
        strict: null // pick heuristically
    });
};
// Pass a message to the logger callback currently installed on this
// document.
Document.prototype.log = function(message) {
    var doc = this;
    doc.logger(message);
};
// Log a message with an "error: " prefix and flag the document as
// containing at least one error.
Document.prototype.logError = function(message) {
    var prefixed = 'error: ' + message;
    this.log(prefixed);
    this.error = true;
};
/* Parse CoNLL-U format, return Document.
* (see http://universaldependencies.github.io/docs/format.html)
*
* CoNLL-U files contain three types of lines:
* 1. Word lines
* 2. Blank lines marking sentence boundaries
* 3. Comment lines starting with a hash ("#")
*
* Each word line has the following format
* 1. ID: Word index, integer starting at 1 for each new sentence;
* may be a range for tokens with multiple words; may be a decimal
* number for empty nodes.
* 2. FORM: Word form or punctuation symbol.
* 3. LEMMA: Lemma or stem of word form.
* 4. UPOSTAG: Universal part-of-speech tag.
* 5. XPOSTAG: Language-specific part-of-speech tag; underscore
* if not available.
* 6. FEATS: List of morphological features from the Universal
* feature inventory or from a defined language-specific extension;
* underscore if not available.
* 7. HEAD: Head of the current token, which is either a value of ID
* or zero (0).
* 8. DEPREL: Universal Stanford dependency relation to the HEAD
* (root iff HEAD = 0) or a defined language-specific subtype
* of one.
* 9. DEPS: List of secondary dependencies (head-deprel pairs).
* 10. MISC: Any other annotation.
*/
// Parse `input` (a string of CoNLL-U text) into this document and return
// the document itself. Any previously parsed state is discarded first.
//
//   input  - the CoNLL-U text to parse.
//   logger - optional callback taking one message string; replaces the
//            document's logger when given.
//   strict - optional flag; when omitted the mode is chosen by
//            selectParsingMode(). It is passed to selectFieldSplitter()
//            and decides whether a missing final blank line is an error.
//
// Problems are reported through logError(), which also sets this.error.
// Lines and sentences that cannot be repaired are dropped.
Document.prototype.parse = function(input, logger, strict) {
    // discard previous state, if any
    this.reset();
    if (logger !== undefined) {
        this.logger = logger;
    }
    if (strict !== undefined) {
        this.strict = strict;
    }
    // Accept LF, CRLF and bare CR line endings. Splitting on '\n' alone
    // leaves a '\r' at the end of every line of CRLF input: in strict
    // (TAB-only) mode that corrupts the last field and stops blank lines
    // from being recognised as sentence boundaries.
    var lines = input.split(/\r\n|\r|\n/);
    if (this.strict === null) {
        this.strict = selectParsingMode(input, this.logger);
    }
    // select splitter to use for dividing the lines into fields.
    var splitter = selectFieldSplitter(input, this.logger, this.strict);
    var that = this,
        elements = [],
        comments = [],
        beforeSentence = true,
        idx,
        line;
    // report an error for the line currently being processed
    var logLineError = function(message) {
        that.logError('line '+(idx+1)+': '+message+' ("'+line+'")');
        that.error = true;
    };
    // turn the collected elements and comments into the next sentence
    var storeSentence = function() {
        var sId = 'S' + (that.sentences.length+1);
        that.sentences.push(new Sentence(sId, elements, comments));
    };
    // forget collected elements/comments and expect a new sentence
    var startNewSentence = function() {
        elements = [];
        comments = [];
        beforeSentence = true;
    };
    for (idx=0; idx<lines.length; idx++) {
        line = lines[idx];
        if (isComment(line)) {
            if (beforeSentence) {
                comments.push(line);
            } else {
                logLineError('comments must precede sentence, ignoring');
            }
            continue;
        }
        // non-comment, assume inside sentence until terminated by
        // blank line
        beforeSentence = false;
        var fields = splitter(line);
        if (fields.length === 0) {
            // empty line, terminates sentence
            if (elements.length !== 0) {
                storeSentence();
            } else {
                logLineError('empty sentence, ignoring');
            }
            startNewSentence();
            continue;
        }
        if (fields.length !== 10) {
            logLineError('expected 10 fields, got '+fields.length);
            repairFields(fields, this.logger);
        }
        var element = new Element(fields, idx, line);
        var issues = element.validate();
        for (var j=0; j<issues.length; j++) {
            logLineError(issues[j]);
        }
        if (issues.length !== 0 && !element.repair(this.logger)) {
            logLineError('repair failed, discarding line');
            continue; // failed, ignore line
        }
        elements.push(element);
    }
    // If elements is non-empty, last sentence ended without its
    // expected terminating empty line. Process, but warn if strict.
    if (elements.length !== 0) {
        if (this.strict) {
            this.logError('missing blank line after last sentence');
        }
        storeSentence();
        startNewSentence();
    }
    // If comments is non-empty, there were comments after the
    // terminating empty line. Warn and discard.
    if (comments.length !== 0) {
        this.logError('comments may not occur after last sentence, '+
            'ignoring');
    }
    return this;
};
// Build one brat data object covering every sentence of the document.
//
//   logger       - optional logging callback; replaces the current one.
//   includeEmpty - whether empty nodes are included (default: false).
//
// Each sentence is validated (and repaired where possible) before being
// converted; sentences that cannot be repaired are skipped. Sentence
// texts are joined with newlines and the per-category lists are
// concatenated. The result also carries this.error under 'error'.
Document.prototype.toBrat = function(logger, includeEmpty) {
    if (logger !== undefined) {
        this.logger = logger;
    }
    // hide empty nodes by default
    var withEmpty = (includeEmpty === undefined ? false : includeEmpty);
    var listKeys = ['entities', 'attributes', 'relations', 'comments',
                    'styles', 'sentlabels'];
    var merged = {};
    listKeys.forEach(function(key) {
        merged[key] = [];
    });
    merged['text'] = '';
    var doc = this,
        consumed = 0; // number of characters of merged text so far
    this.sentences.forEach(function(sentence) {
        var problems = sentence.validate();
        problems.forEach(function(problem) {
            doc.logError(problem);
        });
        if (problems.length !== 0 && !sentence.repair(doc.logger)) {
            doc.logError('repair failed, discarding sentence');
            return;
        }
        // every sentence but the first is preceded by a newline
        sentence.setBaseOffset(consumed === 0 ? 0 : consumed + 1);
        var piece = sentence.toBrat(withEmpty);
        if (merged['text'].length !== 0) {
            merged['text'] += '\n';
            consumed += 1;
        }
        merged['text'] += piece['text'];
        consumed += piece['text'].length;
        listKeys.forEach(function(key) {
            merged[key] = merged[key].concat(piece[key]);
        });
    });
    // to avoid brat breakage on error, don't send empty text
    if (merged['text'].length === 0) {
        merged['text'] = '<EMPTY>';
    }
    merged['error'] = this.error;
    return merged;
};
/*
* ConllU.Sentence: represents CoNLL-U sentence
*/
// Constructor for a sentence: stores its ID, its elements and the
// comment lines that preceded it. The base offset starts at zero.
var Sentence = function(sentenceId, elements, comments) {
    Object.assign(this, {
        id: sentenceId,
        elements: elements,
        comments: comments,
        baseOffset: 0
    });
};
// set offset of first character in sentence (for standoff
// generation)
// Record the offset of this sentence's first character; bratSpans()
// starts counting from it.
Sentence.prototype.setBaseOffset = function(baseOffset) {
    var sentence = this;
    sentence.baseOffset = baseOffset;
};
// Collect the dependencies of every element of the sentence, in element
// order, into a single flat list.
Sentence.prototype.dependencies = function() {
    return this.elements.reduce(function(collected, element) {
        return collected.concat(element.dependencies());
    }, []);
};
// Return the elements that are words; when includeEmpty is truthy,
// empty nodes are returned as well.
Sentence.prototype.words = function(includeEmpty) {
    return this.elements.filter(function(element) {
        if (element.isWord()) {
            return true;
        }
        return includeEmpty && element.isEmptyNode();
    });
};
// Return the elements that are multiword tokens.
Sentence.prototype.multiwords = function() {
    var isRange = function(element) {
        return element.isMultiword();
    };
    return this.elements.filter(isRange);
};
// Return the token sequence of the sentence: every element except
// those whose ID falls inside the range of a multiword token.
Sentence.prototype.tokens = function() {
    // IDs covered by some multiword token range
    var covered = {};
    this.multiwords().forEach(function(mw) {
        var first = mw.rangeFrom(),
            last = mw.rangeTo();
        for (var id = first; id <= last; id++) {
            covered[id] = true;
        }
    });
    return this.elements.filter(function(element) {
        return element.isToken(covered);
    });
};
// return words with possible modifications for visualization with
// brat
// Return the sentence's words for display in brat. A word whose form is
// right-to-left text is replaced by a copy with an adjusted form;
// other words are returned as they are.
Sentence.prototype.bratWords = function(includeEmpty) {
    return this.words(includeEmpty).map(function(word) {
        if (!isRtl(word.form)) {
            return word;
        }
        var copy = deepCopy(word);
        copy.form = rtlFix(copy.form);
        return copy;
    });
};
// return tokens with possible modifications for visualization
// with brat
// Return copies of the sentence's tokens for display in brat, with each
// form passed through rtlFix().
Sentence.prototype.bratTokens = function() {
    return this.tokens().map(function(token) {
        var copy = deepCopy(token);
        copy.form = rtlFix(copy.form);
        return copy;
    });
};
// return the text of the sentence for visualization with brat
// Return the sentence text for brat: the word forms joined by spaces,
// followed on a second line by the token forms when the two differ.
Sentence.prototype.bratText = function(includeEmpty) {
    var joinForms = function(items) {
        return items.map(function(item) { return item.form; }).join(' ');
    };
    var wordText = joinForms(this.bratWords(includeEmpty));
    var tokenText = joinForms(this.bratTokens());
    if (wordText === tokenText) {
        return wordText;
    }
    return wordText + '\n' + tokenText;
};
// return the annotated text spans of the sentence for visualization
// with brat.
// Return one brat span per word, as [spanId, upostag, [[start, end]]].
// Offsets start at the sentence's base offset and advance by the form
// length plus one (for the separating space) after each word.
Sentence.prototype.bratSpans = function(includeEmpty) {
    var sentenceId = this.id,
        cursor = this.baseOffset;
    return this.bratWords(includeEmpty).map(function(word) {
        var start = cursor,
            end = start + word.form.length;
        cursor = end + 1;
        return [sentenceId+'-T'+word.id, word.upostag, [[start, end]]];
    });
};
// return attributes of sentence annotations for visualization
// with brat.
// Return one brat attribute per (feature name, value) pair of each
// word, as [attributeId, name, spanId, value]. Attribute IDs are
// numbered from 1 across the whole sentence.
Sentence.prototype.bratAttributes = function(includeEmpty) {
    var sentenceId = this.id,
        nextSeq = 1,
        attributes = [];
    this.words(includeEmpty).forEach(function(word) {
        var target = sentenceId+'-T'+word.id;
        word.features().forEach(function(pair) {
            attributes.push([sentenceId+'-A'+nextSeq, pair[0], target, pair[1]]);
            nextSeq += 1;
        });
    });
    return attributes;
};
// return relations for sentence dependencies for visualization
// with brat.
// Return one brat relation per dependency of the sentence, as
// [relationId, deprel, [['arg1', headSpan], ['arg2', dependentSpan]]].
// Relation IDs are numbered from 0.
Sentence.prototype.bratRelations = function(includeEmpty) {
    var sentenceId = this.id;
    return this.dependencies().map(function(dep, index) {
        var headSpan = sentenceId+'-T'+dep[1],
            dependentSpan = sentenceId+'-T'+dep[0];
        return [sentenceId+'-R'+index, dep[2],
                [ [ 'arg1', headSpan ], [ 'arg2', dependentSpan ] ]];
    });
};
// return comments (notes) on sentence annotations for
// visualization with brat.
// Return brat notes for each word: always the lemma, plus XPOSTAG and
// MISC when they are not the underscore placeholder. Each note is
// [spanId, 'AnnotatorNotes', text].
Sentence.prototype.bratComments = function(includeEmpty) {
    // TODO: better visualization for LEMMA, XPOSTAG, and MISC.
    var sentenceId = this.id,
        notes = [];
    this.words(includeEmpty).forEach(function(word) {
        var target = sentenceId+'-T'+word.id;
        var addNote = function(text) {
            notes.push([target, 'AnnotatorNotes', text]);
        };
        addNote('Lemma: ' + word.lemma);
        if (word.xpostag !== '_') {
            addNote('Xpostag: ' + word.xpostag);
        }
        if (word.misc !== '_') {
            addNote('Misc: ' + word.misc);
        }
    });
    return notes;
};
// Return styles on sentence annotations for visualization with
// brat. Note: this feature is an extension of both the CoNLL-U
// comment format and the basic brat data format.
// Reads "# visual-style ..." comment lines of the sentence and returns a
// list of [reference, key, value] style entries. A reference is a span
// ID string, or an array [headSpanId, dependentSpanId, type] for an arc.
// The wildcard references "nodes" and "arcs" are expanded at the end to
// every word / dependency that has no entry for the same key yet.
Sentence.prototype.bratStyles = function(includeEmpty) {
var styles = [],
wildcards = [];
for (var i=0; i<this.comments.length; i++) {
var comment = this.comments[i];
// only comments of the form "# visual-style <spec>" are considered
var m = comment.match(/^(\#\s*visual-style\s+)(.*)/);
if (!m) {
continue;
}
var styleSpec = m[2];
// Attempt to parse as a visual style specification. The
// expected format is "REF<SPACE>STYLE", where REF
// is either a single ID (for a span), a space-separated
// ID1 ID2 TYPE triple (for a relation), or a special
// wildcard value like "arcs", and STYLE is either
// a colon-separated key-value pair or a color.
m = styleSpec.match(/^([^\t]+)\s+(\S+)\s*$/);
if (!m) {
// TODO: avoid console.log
console.log('warning: failed to parse: "'+comment+'"');
continue;
}
var reference = m[1], style = m[2];
// split style into key and value, adding a key to
// color-only styles as needed for the reference type.
var key, value;
m = style.match(/^(\S+):(\S+)$/);
if (m) {
key = m[1];
value = m[2];
} else {
value = style;
// a bare color applies to 'color' for arcs and to 'bgColor'
// for spans
if (reference === 'arcs' || reference.indexOf(' ') !== -1) {
key = 'color';
} else {
key = 'bgColor';
}
}
// store wildcards for separate later processing
if (reference.match(/^(nodes|arcs)$/)) {
wildcards.push([reference, key, value]);
continue;
}
// adjust every ID in reference for brat
if (reference.indexOf(' ') === -1) {
reference = this.id + '-T' + reference;
} else {
// relation reference: only the two IDs get the span prefix, the
// remaining parts are kept as they are
reference = reference.split(' ');
reference[0] = this.id + '-T' + reference[0];
reference[1] = this.id + '-T' + reference[1];
}
styles.push([reference, key, value]);
}
// for expanding wildcards, first determine which words / arcs
// styles have already been set, and then add the style to
// everything that hasn't.
var setStyle = {};
for (var i=0; i<styles.length; i++) {
// concat() works for both reference shapes: it appends the key to a
// string reference and to an array reference; the result is then
// converted to a string when used as a property name
setStyle[styles[i][0].concat([styles[i][1]])] = true;
}
for (var i=0; i<wildcards.length; i++) {
var reference = wildcards[i][0],
key = wildcards[i][1],
value = wildcards[i][2];
if (reference === 'nodes') {
var words = this.words(includeEmpty);
for (var j=0; j<words.length; j++) {
var r = this.id + '-T' + words[j].id;
if (!setStyle[r.concat([key])]) {
styles.push([r, key, value]);
setStyle[r.concat([key])] = true;
}
}
} else if (reference === 'arcs') {
var deps = this.dependencies();
for (var j=0; j<deps.length; j++) {
// arc reference is [head span, dependent span, deprel]
var r = [this.id + '-T' + deps[j][1],
this.id + '-T' + deps[j][0],
deps[j][2]];
if (!setStyle[r.concat([key])]) {
styles.push([r, key, value]);
setStyle[r.concat([key])] = true;
}
}
} else {
// not reachable for entries stored above, which only match
// "nodes" or "arcs"
console.log('internal error');
}
}
return styles;
};
// Return label of sentence for visualization with brat, or null
// if not defined. Note: this feature is an extension of both the
// CoNLL-U comment format and the basic brat data format.
// Return the text of the last "# sentence-label ..." comment of the
// sentence with surrounding whitespace removed, or null when there is
// no such comment.
Sentence.prototype.bratLabel = function() {
    var labelPattern = /^(\#\s*sentence-label\b)(.*)/;
    return this.comments.reduce(function(label, comment) {
        var m = comment.match(labelPattern);
        return m ? m[2].trim() : label;
    }, null);
};
// Return representation of sentence in brat embedded format (see
// http://brat.nlplab.org/embed.html).
// If includeEmpty is truthy, include empty nodes in the representation.
// Note: "styles" is an extension, not part of the basic format.
// Assemble the brat representation of the sentence from the individual
// brat* helpers. The sentence label is wrapped in a one-element list.
Sentence.prototype.toBrat = function(includeEmpty) {
    // property order matches the order in which the helpers are called
    return {
        'text': this.bratText(includeEmpty),
        'entities': this.bratSpans(includeEmpty),
        'attributes': this.bratAttributes(includeEmpty),
        'relations': this.bratRelations(includeEmpty),
        'comments': this.bratComments(includeEmpty),
        'styles': this.bratStyles(includeEmpty),
        'sentlabels': [this.bratLabel()],
    };
};
// Return an object mapping each element ID to its element. When an ID
// occurs more than once, the last element with that ID is kept.
Sentence.prototype.elementById = function() {
    var index = {};
    this.elements.forEach(function(element) {
        index[element.id] = element;
    });
    return index;
};
// Append an issue to `issues`, prefixed with the 1-based number of the
// element's line and followed by the text of that line.
Sentence.prototype.addError = function(issue, element, issues) {
    var lineNumber = element.lineidx + 1;
    var quotedLine = ' ("' + element.line + '")';
    issues.push('line ' + lineNumber + ': ' + issue + quotedLine);
};
// Check validity of the sentence. Return list of strings
// representing issues found in validation (empty list if none).
// Run every sentence-level check in a fixed order and return the list
// of issues they reported (empty when the sentence is valid).
Sentence.prototype.validate = function() {
    var issues = [];
    var checks = [
        'validateUniqueIds',
        'validateWordSequence',
        'validateMultiwordSequence',
        'validateEmptyNodeSequence',
        'validateReferences'
    ];
    checks.forEach(function(check) {
        this[check](issues);
    }, this);
    return issues;
};
// Check for presence of ID duplicates
// Report every element whose ID was already used by an earlier element
// of the sentence. Returns true iff no issue was added.
Sentence.prototype.validateUniqueIds = function(issues) {
    var found = (issues !== undefined ? issues : []);
    var countBefore = found.length,
        seen = {},
        sentence = this;
    this.elements.forEach(function(element) {
        if (seen[element.id] !== undefined) {
            sentence.addError('non-unique ID "'+element.id+'"',
                              element, found);
        }
        seen[element.id] = element;
    });
    return found.length === countBefore;
};
// Check validity of word ID sequence (should be 1,2,3,...)
// Check that plain word IDs count 1,2,3,... Multiword tokens and empty
// nodes are skipped. After a mismatch, counting resumes from the ID
// actually seen. Returns true iff no issue was added.
Sentence.prototype.validateWordSequence = function(issues) {
    var found = (issues !== undefined ? issues : []);
    var countBefore = found.length,
        expected = 1,
        sentence = this;
    this.elements.forEach(function(element) {
        if (element.isMultiword() || element.isEmptyNode()) {
            return; // only check simple word sequence here
        }
        var actual = parseInt(element.id, 10);
        if (actual !== expected) {
            sentence.addError('word IDs should be 1,2,3,..., ' +
                              'expected '+expected+', got '+element.id,
                              element, found);
        }
        expected = actual + 1;
    });
    return found.length === countBefore;
};
// Check that multiword token ranges are valid
// Check that each multiword token appears directly before the first
// word of its range, i.e. its range starts at the ID expected next.
// Returns true iff no issue was added.
Sentence.prototype.validateMultiwordSequence = function(issues) {
    var found = (issues !== undefined ? issues : []);
    var countBefore = found.length,
        expected = 1,
        sentence = this;
    this.elements.forEach(function(element) {
        var misplaced = element.isMultiword() &&
                        element.rangeFrom() !== expected;
        if (misplaced) {
            sentence.addError('multiword tokens must appear before '+
                              'first word in their range',
                              element, found);
            return;
        }
        // parseInt reads the leading integer of any kind of ID
        expected = parseInt(element.id, 10) + 1;
    });
    return found.length === countBefore;
};
// Check that the empty nodes following a word are numbered W.1, W.2,
// ... where W is the ID of that word ("0" before the first word).
// Returns true iff no issue was added.
Sentence.prototype.validateEmptyNodeSequence = function(issues) {
    var found = (issues !== undefined ? issues : []);
    var countBefore = found.length,
        sentence = this;
    // TODO check https://github.com/UniversalDependencies/docs/issues/382
    var lastWordId = '0',
        nextIndex = 1;
    this.elements.forEach(function(element) {
        if (element.isWord()) {
            lastWordId = element.id;
            nextIndex = 1;
            return;
        }
        if (!element.isEmptyNode()) {
            return;
        }
        var expected = lastWordId + '.' + nextIndex;
        if (element.id !== expected) {
            sentence.addError('empty node IDs should be *.1, *.2, ... ' +
                              'expected '+expected+', got '+element.id,
                              element, found);
        }
        nextIndex++;
    });
    return found.length === countBefore;
};
// Check validity of ID references in HEAD and DEPS.
// Check that the HEAD of every element, and every head listed in its
// DEPS, refers to an element of this sentence (or is "0").
// Returns true iff no issue was added.
Sentence.prototype.validateReferences = function(issues) {
    var found = (issues !== undefined ? issues : []);
    var countBefore = found.length,
        byId = this.elementById(),
        sentence = this;
    this.elements.forEach(function(element) {
        // validate HEAD
        if (!element.validHeadReference(byId)) {
            sentence.addError('HEAD is not valid ID: "'+element.head+'"',
                              element, found);
        }
        // validate DEPS (the HEAD/DEPREL pair itself is skipped)
        element.dependencies(true).forEach(function(dep) {
            var head = dep[1];
            if (head !== '0' && byId[head] === undefined) {
                sentence.addError('invalid ID "'+head+'" in DEPS',
                                  element, found);
            }
        });
    });
    return found.length === countBefore;
};
// Attempt to repair the sentence: for each check that fails, run the
// matching repair step, in a fixed order. Returns true iff the sentence
// validates without issues afterwards.
Sentence.prototype.repair = function(log) {
    log = (log !== undefined ? log : nullLogger);
    // [check, repair step] pairs, applied in this order
    var steps = [
        ['validateUniqueIds', 'repairUniqueIds'],
        ['validateWordSequence', 'repairWordSequence'],
        ['validateMultiwordSequence', 'repairMultiwordSequence'],
        ['validateEmptyNodeSequence', 'repairEmptyNodeSequence'],
        ['validateReferences', 'repairReferences']
    ];
    for (var i = 0; i < steps.length; i++) {
        var check = steps[i][0],
            fix = steps[i][1];
        if (!this[check]()) {
            this[fix](log);
        }
    }
    return this.validate().length === 0;
};
// Remove every element whose ID was already used by an earlier element,
// logging each removal. Always returns true.
Sentence.prototype.repairUniqueIds = function(log) {
    log = (log !== undefined ? log : nullLogger);
    var firstById = {};
    this.elements = this.elements.filter(function(element) {
        if (firstById[element.id] !== undefined) {
            log('repair: remove element with duplicate ID "'+element.id+'"');
            return false;
        }
        firstById[element.id] = element;
        return true;
    });
    return true;
};
// Placeholder for repairing the word ID sequence: not implemented yet,
// it only logs a TODO note and returns true.
//
//   log - optional logging callback. Like the other repair methods it
//         now defaults to nullLogger, so calling without an argument no
//         longer throws a TypeError.
Sentence.prototype.repairWordSequence = function(log) {
    log = (log !== undefined ? log : nullLogger);
    log('TODO: implement ConllU.Sentence.repairWordSequence()');
    return true;
};
// Placeholder for repairing multiword token placement: not implemented
// yet, it only logs a TODO note and returns true.
//
//   log - optional logging callback. Like the other repair methods it
//         now defaults to nullLogger, so calling without an argument no
//         longer throws a TypeError.
Sentence.prototype.repairMultiwordSequence = function(log) {
    log = (log !== undefined ? log : nullLogger);
    log('TODO: implement ConllU.Sentence.repairMultiwordSequence()');
    return true;
};
// Placeholder for repairing empty node numbering: not implemented yet,
// it only logs a TODO note and returns true.
//
//   log - optional logging callback. Like the other repair methods it
//         now defaults to nullLogger, so calling without an argument no
//         longer throws a TypeError.
Sentence.prototype.repairEmptyNodeSequence = function(log) {
    log = (log !== undefined ? log : nullLogger);
    log('TODO: implement ConllU.Sentence.repairEmptyNodeSequence()');
    return true;
};
// Repair references to IDs that do not exist in the sentence:
//   - a HEAD that is not a valid reference is replaced by null;
//   - DEPS entries whose head is neither "0" nor a known ID are
//     removed, and DEPS becomes "_" when nothing is left.
//
//   log - optional logging callback (defaults to nullLogger).
//
// Always returns true.
Sentence.prototype.repairReferences = function(log) {
    log = (log !== undefined ? log : nullLogger);
    var elementById = this.elementById();
    for (var i=0; i<this.elements.length; i++) {
        var element = this.elements[i];
        // repair HEAD if not valid
        if (!element.validHeadReference(elementById)) {
            log('repair: blanking invalid HEAD');
            element.head = null;
        }
        // repair DEPS if not valid
        if (element.deps === '_') {
            continue;
        }
        var deparr = element.deps.split('|'),
            filtered = [];
        for (var j=0; j<deparr.length; j++) {
            var dep = deparr[j];
            var m = dep.match(dependencyRegex);
            if (!m) {
                console.log('internal error: repairReferences(): ' +
                            'invalid DEPS');
                continue;
            }
            var head = m[1];
            if (head === '0' || elementById[head] !== undefined) {
                filtered.push(dep);
            } else {
                // The original code also assigned to an undeclared
                // `error` variable here. This file is an ES module and
                // therefore strict, so that assignment threw a
                // ReferenceError instead of completing the repair.
                log('repair: removing invalid ID from DEPS');
            }
        }
        element.deps = (filtered.length === 0 ? '_' : filtered.join('|'));
    }
    return true;
};
/*
* ConllU.Element: represents CoNLL-U word or multiword token
*/
// represents CoNLL-U word or multiword token
// Constructor for one line of a sentence. The first ten entries of
// `fields` are stored under the CoNLL-U column names, together with
// the index and the text of the source line.
var Element = function(fields, lineidx, line) {
    var columns = ['id', 'form', 'lemma', 'upostag', 'xpostag',
                   'feats', 'head', 'deprel', 'deps', 'misc'];
    for (var i = 0; i < columns.length; i++) {
        this[columns[i]] = fields[i];
    }
    this.lineidx = lineidx;
    this.line = line;
};
// constraints that hold for all fields
// Check the constraints that hold for every field: it must be defined,
// non-empty and, unless allowSpace is set, free of whitespace.
// A description of the first violation is appended to `issues`.
// Returns true iff the field passes.
Element.prototype.validateField = function(field, name, issues,
                                           allowSpace) {
    var label = (name !== undefined ? name : 'field');
    var found = (issues !== undefined ? issues : []);
    var spaceAllowed = (allowSpace === undefined ? false : allowSpace);
    if (field === undefined) {
        found.push('invalid '+label);
        return false;
    }
    if (field.length === 0) {
        found.push(label+' must not be empty: "'+field+'"');
        return false;
    }
    if (hasSpace(field) && !spaceAllowed) {
        found.push(label+' must not contain space: "'+field+'"');
        return false;
    }
    return true;
};
// Check that `id` is a valid CoNLL-U ID. Accepted forms:
//   - an integer other than "0" (word);
//   - a range "M-N" with M <= N (multiword token);
//   - a decimal "I.J" with J >= 1 (empty node). I may be 0: the format
//     allows empty-node IDs below 1, and validateEmptyNodeSequence()
//     expects "0.1" for an empty node placed before the first word.
//
//   id     - the ID string to check.
//   issues - optional list; a description of the problem is appended.
//
// Returns true iff the ID is valid.
Element.prototype.validateId = function(id, issues) {
    issues = (issues !== undefined ? issues : []);
    if (!this.validateField(id, 'ID', issues)) {
        return false;
    }
    var m;
    if (id.match(/^\d+$/)) {
        if (id === '0') {
            issues.push('ID indices must start from 1: "'+id+'"');
            return false;
        }
        return true;
    }
    m = id.match(/^(\d+)-(\d+)$/);
    if (m) {
        var start = parseInt(m[1], 10),
            end = parseInt(m[2], 10);
        if (end < start) {
            issues.push('ID ranges must have start <= end: "'+id+'"');
            return false;
        }
        return true;
    }
    m = id.match(/^(\d+)\.(\d+)$/);
    if (m) {
        // only the index after the dot has to start from 1
        if (parseInt(m[2], 10) === 0) {
            issues.push('ID indices must start from 1: "'+id+'"');
            return false;
        }
        return true;
    }
    issues.push('ID must be integer, range, or decimal: "'+id+'"');
    return false;
};
// Check the FORM field; only the general field constraints apply, and
// whitespace is permitted. Returns true iff valid.
Element.prototype.validateForm = function(form, issues) {
    var found = (issues !== undefined ? issues : []);
    return Boolean(this.validateField(form, 'FORM', found, true));
};
// Check the LEMMA field; only the general field constraints apply, and
// whitespace is permitted. Returns true iff valid.
Element.prototype.validateLemma = function(lemma, issues) {
    var found = (issues !== undefined ? issues : []);
    return Boolean(this.validateField(lemma, 'LEMMA', found, true));
};
// Check the UPOSTAG field; only the general field constraints apply.
// Returns true iff valid.
Element.prototype.validateUpostag = function(upostag, issues) {
    var found = (issues !== undefined ? issues : []);
    return Boolean(this.validateField(upostag, 'UPOSTAG', found));
};
// Check the XPOSTAG field; only the general field constraints apply.
// Returns true iff valid.
Element.prototype.validateXpostag = function(xpostag, issues) {
    var found = (issues !== undefined ? issues : []);
    return Boolean(this.validateField(xpostag, 'XPOSTAG', found));
};
// Check the FEATS field. Apart from the general field constraints, a
// value other than "_" must be a "|"-separated list of Name=Value[,Value]
// entries in which
//   - every entry matches featureRegex,
//   - names appear in case-insensitive alphabetical order,
//   - no name is repeated, and
//   - no value is repeated within one entry.
//
//   feats  - the FEATS string to check.
//   issues - optional list; a description of every problem is appended.
//
// Returns true iff no issue was found.
Element.prototype.validateFeats = function(feats, issues) {
    issues = (issues !== undefined ? issues : []);
    if (!this.validateField(feats, 'FEATS', issues)) {
        return false;
    } else if (feats === '_') {
        return true;
    }
    var initialIssueCount = issues.length;
    var featarr = feats.split('|');
    var featmap = {};
    var prevName = null;
    for (var i=0; i<featarr.length; i++) {
        var feat = featarr[i];
        var m = feat.match(featureRegex);
        if (!m) {
            // TODO more descriptive issue
            issues.push('invalid FEATS entry: "'+feat+'"');
            continue;
        }
        var name = m[1], valuestr = m[2];
        if (prevName !== null &&
            name.toLowerCase() < prevName.toLowerCase()) {
            // The original code also assigned to an undeclared `noIssue`
            // variable here. This file is an ES module and therefore
            // strict, so that assignment threw a ReferenceError. The
            // result is derived from the issue count instead.
            issues.push('features must be ordered alphabetically ' +
                        '(case-insensitive): "'+name+'" < "'+prevName+'"');
        }
        prevName = name;
        var values = valuestr.split(',');
        var valuemap = {}, validValues = [];
        for (var j=0; j<values.length; j++) {
            var value = values[j];
            if (!value.match(featureValueRegex)) {
                issues.push('invalid FEATS value: "'+value+'"');
                continue;
            }
            if (valuemap[value] !== undefined) {
                issues.push('duplicate feature value: "'+value+'"');
                continue;
            }
            valuemap[value] = true;
            validValues.push(value);
        }
        if (featmap[name] !== undefined) {
            issues.push('duplicate feature name: "'+name+'"');
            continue;
        }
        if (validValues.length !== 0) {
            featmap[name] = validValues;
        }
    }
    return issues.length === initialIssueCount;
};
// Check the HEAD field. null is accepted (it is what repair leaves
// behind), "_" is accepted for empty nodes only, and any other value
// must consist of digits. Returns true iff valid.
Element.prototype.validateHead = function(head, issues) {
    var found = (issues !== undefined ? issues : []);
    // TODO: consider checking that DEPREL is "root" iff HEAD is 0.
    if (head === null) {
        return true; // exceptional case for Element.repair()
    }
    if (!this.validateField(head, 'HEAD', found)) {
        return false;
    }
    if (this.isEmptyNode() && head === '_') {
        return true; // underscore permitted for empty nodes.
    }
    if (head.match(/^\d+$/)) {
        return true;
    }
    found.push('HEAD must be an ID or zero: "'+head+'"');
    return false;
};
// Check the DEPREL field; only the general field constraints apply.
// Returns true iff valid.
Element.prototype.validateDeprel = function(deprel, issues) {
    var found = (issues !== undefined ? issues : []);
    return Boolean(this.validateField(deprel, 'DEPREL', found));
};
// Check the DEPS field. Apart from the general field constraints, a
// value other than "_" must be a "|"-separated list of HEAD:DEPREL
// pairs ordered by head index.
//
// Head IDs are compared part by part (integer part first, then the
// empty-node index after the dot). Comparing them as floating point
// numbers would order empty node "5.10" before "5.2".
//
//   deps   - the DEPS string to check.
//   issues - optional list; a description of the first problem found is
//            appended.
//
// Returns true iff valid.
Element.prototype.validateDeps = function(deps, issues) {
    issues = (issues !== undefined ? issues : []);
    // TODO: consider checking that deprel is "root" iff head is 0.
    if (!this.validateField(deps, 'DEPS', issues)) {
        return false;
    } else if (deps === '_') {
        return true;
    }
    var deparr = deps.split('|');
    var prevHead = null; // [integer part, empty-node index] of previous head
    // TODO: don't short-circuit on first error
    for (var i=0; i<deparr.length; i++) {
        var m = deparr[i].match(/^(\d+)(?:\.(\d+))?:(\S+)$/);
        if (!m) {
            // TODO more descriptive issue
            issues.push('invalid DEPS: "'+deps+'"');
            return false;
        }
        var head = [parseInt(m[1], 10),
                    m[2] !== undefined ? parseInt(m[2], 10) : 0];
        if (prevHead !== null &&
            (head[0] < prevHead[0] ||
             (head[0] === prevHead[0] && head[1] < prevHead[1]))) {
            issues.push('DEPS must be ordered by head index');
            return false;
        }
        prevHead = head;
    }
    return true;
};
// Check the MISC field; only the general field constraints apply.
// Returns true iff valid.
Element.prototype.validateMisc = function(misc, issues) {
    var found = (issues !== undefined ? issues : []);
    return Boolean(this.validateField(misc, 'MISC', found));
};
// Return true iff HEAD needs no target ("_", null or "0") or names an
// ID present in elementById.
Element.prototype.validHeadReference = function(elementById) {
    var head = this.head;
    if (head === '_' || head === null || head === '0') {
        return true;
    }
    return elementById[head] !== undefined;
};
// An element is a word iff its ID is a plain integer.
Element.prototype.isWord = function() {
    var integerId = this.id.match(/^\d+$/);
    return integerId !== null;
};
// An element is a multiword token iff its ID is a range "M-N".
Element.prototype.isMultiword = function() {
    var rangeId = this.id.match(/^\d+-\d+$/);
    return rangeId !== null;
};
// An element is an empty node iff its ID is a decimal "I.J".
Element.prototype.isEmptyNode = function() {
    var decimalId = this.id.match(/^\d+\.\d+$/);
    return decimalId !== null;
};
// Return the first number of a range ID "M-N" as an integer. Throws a
// TypeError when the ID is not a range.
Element.prototype.rangeFrom = function() {
    var parts = this.id.match(/^(\d+)-\d+$/);
    return parseInt(parts[1], 10);
};
// Return the last number of a range ID "M-N" as an integer. Throws a
// TypeError when the ID is not a range.
Element.prototype.rangeTo = function() {
    var parts = this.id.match(/^\d+-(\d+)$/);
    return parseInt(parts[1], 10);
};
// An element is a token iff it is a multiword token or its ID is not
// marked in `inRange` (the IDs covered by multiword token ranges).
Element.prototype.isToken = function(inRange) {
    if (this.isMultiword()) {
        return true;
    }
    return !inRange[this.id];
};
// return list of (DEPENDENT, HEAD, DEPREL) lists
// Return the dependencies of this element as a list of
// [dependent, head, deprel] triples: first the HEAD/DEPREL pair (unless
// skipHead is set or HEAD is "_" or null), then one triple per DEPS
// entry. DEPS entries that cannot be parsed are reported on the console
// and skipped.
Element.prototype.dependencies = function(skipHead) {
    var omitHead = (skipHead !== undefined ? skipHead : false);
    var element = this,
        found = [];
    if (!omitHead && this.head !== '_' && this.head !== null) {
        found.push([this.id, this.head, this.deprel]);
    }
    if (this.deps == '_') {
        return found;
    }
    this.deps.split('|').forEach(function(dep) {
        var m = dep.match(dependencyRegex);
        if (m) {
            found.push([element.id, m[1], m[2]]);
        } else {
            console.log('internal error: dependencies(): invalid DEPS',
                        element.deps);
        }
    });
    return found;
};
// return list of (name, value) pairs
// Return the morphological features of the element as a list of
// [name, value] pairs, one pair per value. Entries and values that do
// not match the expected patterns are skipped.
Element.prototype.features = function() {
    if (this.feats === '_') {
        return [];
    }
    var pairs = [];
    this.feats.split('|').forEach(function(feat) {
        var parsed = feat.match(featureRegex);
        if (!parsed) {
            return;
        }
        var name = parsed[1];
        parsed[2].split(',').forEach(function(value) {
            if (value.match(featureValueRegex)) {
                pairs.push([name, value]);
            }
        });
    });
    return pairs;
};
// Check validity of the element. Return list of strings
// representing issues found in validation (empty list if none).
// Validate every field of the element and return the list of issues
// found (empty when the element is valid). ID and FORM are always
// checked. A multiword token must have "_" in all remaining fields;
// for any other element each remaining field has its own check.
Element.prototype.validate = function() {
    var issues = [];
    this.validateId(this.id, issues);
    this.validateForm(this.form, issues);
    if (this.isMultiword()) {
        // multiword tokens (elements with range IDs) are (locally) valid
        // iff all remaining fields (3-10) contain just an underscore.
        var rest = [this.lemma, this.upostag, this.xpostag, this.feats,
                    this.head, this.deprel, this.deps, this.misc];
        var allBlank = rest.every(function(value) {
            return value == '_';
        });
        if (!allBlank) {
            issues.push('non-underscore field for multiword token');
        }
        return issues;
    }
    // if we're here, not a multiword token.
    var checks = [
        ['validateLemma', 'lemma'],
        ['validateUpostag', 'upostag'],
        ['validateXpostag', 'xpostag'],
        ['validateFeats', 'feats'],
        ['validateHead', 'head'],
        ['validateDeprel', 'deprel'],
        ['validateDeps', 'deps'],
        ['validateMisc', 'misc']
    ];
    checks.forEach(function(check) {
        this[check[0]](this[check[1]], issues);
    }, this);
    return issues;
};
// Attempt to repair a non-valid element. Return true iff the
// element is valid following repair, false otherwise.
// Attempt to repair the element by replacing every invalid field with a
// placeholder value, logging each replacement. An invalid ID cannot be
// repaired and makes the method return false at once. A multiword token
// has all fields after FORM set to "_". Returns true iff the element
// validates without issues afterwards.
Element.prototype.repair = function(log) {
    log = (log !== undefined ? log : nullLogger);
    if (!this.validateId(this.id)) {
        return false; // can't be helped
    }
    if (!this.validateForm(this.form)) {
        log('repair: blanking invalid FORM');
        this.form = '<ERROR>';
    }
    var element = this;
    if (this.isMultiword()) {
        // valid as long as everything is blank
        ['lemma', 'upostag', 'xpostag', 'feats',
         'head', 'deprel', 'deps', 'misc'].forEach(function(field) {
            element[field] = '_';
        });
        return true;
    }
    // if we're here, not a multiword token.
    // [field, check, log message, replacement value]
    var fixes = [
        ['lemma', 'validateLemma', 'repair: blanking invalid LEMMA', '<ERROR>'],
        // TODO: '_' is not valid for UPOSTAG
        ['upostag', 'validateUpostag', 'repair: blanking invalid UPOSTAG', '_'],
        ['xpostag', 'validateXpostag', 'repair: blanking invalid XPOSTAG', '_'],
        ['feats', 'validateFeats', 'repair: blanking invalid FEATS', '_'],
        // note: exceptional case, HEAD becomes null rather than '_'
        ['head', 'validateHead', 'repair: blanking invalid HEAD', null],
        // TODO: '_' is not valid for DEPREL
        ['deprel', 'validateDeprel', 'repair: blanking invalid DEPREL', '_'],
        ['deps', 'validateDeps', 'repair: blanking invalid DEPS', '_'],
        ['misc', 'validateMisc', 'repair: blanking invalid MISC', '_']
    ];
    fixes.forEach(function(fix) {
        var field = fix[0];
        if (!element[fix[1]](element[field])) {
            log(fix[2]);
            element[field] = fix[3];
        }
    });
    return this.validate().length === 0;
};
/*
* Miscellaneous support functions.
*/
// Bring `fields` to exactly 10 entries, modifying the array in place:
// surplus entries are dropped and missing ones are filled with "_".
//
//   fields - array of field strings for one line (modified in place).
//   logger - optional logging callback (defaults to nullLogger).
//
// Two defects of the earlier version are fixed here. Its fill loop
// compared against 10-fields.length while pushing, so the bound shrank
// with every push and only about half of the missing fields were
// added. Its truncation assigned a sliced copy to the local parameter,
// which left the caller's array untouched.
var repairFields = function(fields, logger) {
    if (logger === undefined) {
        logger = nullLogger;
    }
    if (fields.length > 10) {
        logger('repair: discarding fields > 10');
        fields.length = 10; // truncate in place
    } else {
        logger('repair: filling in empty ("_") for missing fields');
        while (fields.length < 10) {
            fields.push('_');
        }
    }
};
// Strict CoNLL-U field splitting: fields are separated by single TAB
// characters and nothing is trimmed. An empty line has no fields.
var strictFieldSplitter = function(line) {
    return line.length === 0 ? [] : line.split('\t');
};
// Loose CoNLL-U field splitting: surrounding whitespace is removed and
// fields are separated by any run of whitespace. A line that is empty
// after trimming has no fields.
var looseFieldSplitter = function(line) {
    var trimmed = line.trim();
    return trimmed.length === 0 ? [] : trimmed.split(/\s+/);
};
// Decide whether to parse in strict mode. Very simple heuristic: strict
// iff the input contains at least one TAB. The decision is reported
// through `log`. Returns true for strict mode, false for loose mode.
var selectParsingMode = function(conll, log) {
    var hasTab = conll.indexOf('\t') !== -1;
    log(hasTab
        ? 'note: TAB found, parsing CoNLL-U in strict mode.'
        : 'note: no TAB found, parsing CoNLL-U in loose mode.');
    return hasTab;
};
// Return the function used for dividing lines into fields: the strict
// splitter when `strict` is truthy, the loose one otherwise. The other
// two parameters are not used.
var selectFieldSplitter = function(conll, log, strict) {
    return strict ? strictFieldSplitter : looseFieldSplitter;
};
// A line is a comment iff its first character is a hash ("#").
var isComment = function(line) {
    if (line.length === 0) {
        return false;
    }
    return line[0] === '#';
};
// Return true iff the string contains at least one whitespace character.
var hasSpace = function(s) {
    var firstSpace = s.match(/\s/);
    return firstSpace !== null;
};
// Logger that discards its message; used where a logging callback is
// optional and none was given. Returns null.
var nullLogger = function(message) {
    var nothing = null;
    return nothing;
};
/*
* Return true iff given string only contains characters from a
* right-to-left Unicode block and is not empty.
*/
// Test whether a string is non-empty and made up solely of characters
// from the right-to-left ranges listed in the pattern below.
var isRtl = function(s) {
    // range from http://stackoverflow.com/a/14824756
    var rtlOnly = s.match(/^[\u0591-\u07FF\uFB1D-\uFDFD\uFE70-\uFEFC]+$/);
    return rtlOnly !== null;
};
/*
* Return given token with possible modifications to accommodate
* issues in brat rendering of right-to-left text
* (https://github.com/UniversalDependencies/docs/issues/52)
*/
// Wrap a right-to-left string in U+02D1 characters on both sides; any
// other string is returned unchanged.
var rtlFix = function(s) {
    var marker = '\u02D1';
    return isRtl(s) ? marker + s + marker : s;
};
/*
* Return a deep copy of the given object. Note: not particularly
* efficient, and all fields must be serializable for this to work
* correctly.
*/
// Copy an object by serialising it to JSON and parsing the result, so
// only JSON-serialisable content is carried over to the copy.
var deepCopy = function(o) {
    var serialised = JSON.stringify(o);
    return JSON.parse(serialised);
};
/*
* Regular expressions for various parts of the format.
* See https://github.com/UniversalDependencies/docs/issues/33
*/
// match single (feature, value[s]) pair in FEATS, e.g. "Case=Nom" or
// "Number=Sing,Plur". Group 1 is the feature name (optionally followed
// by a bracketed lowercase part), group 2 the comma-separated value list.
var featureRegex = /^([A-Z0-9][a-zA-Z0-9]*(?:\[[a-z0-9]+\])?)=([A-Z0-9][a-zA-Z0-9]*(?:,[A-Z0-9][a-zA-Z0-9]*)*)$/;
// match single feature value in FEATS: an uppercase letter or digit
// followed by letters and digits
var featureValueRegex = /^[A-Z0-9][a-zA-Z0-9]*$/;
// match single (head, deprel) pair in DEPS. Group 1 is the head ID
// (integer or decimal), group 2 everything after the first colon.
var dependencyRegex = /^(\d+(?:\.\d+)?):(.*)$/;
// The three constructors are the only names exposed to users of the
// module; all helper functions above stay private to this closure.
return {
Document: Document,
Sentence: Sentence,
Element: Element,
};
})();
// Default export of the module: an object holding the Document,
// Sentence and Element constructors of ConllU.
export default {
Document: ConllU.Document,
Sentence: ConllU.Sentence,
Element: ConllU.Element
}