academia
Version:
Tools for analyzing academic text
181 lines (170 loc) • 6.35 kB
JavaScript
"use strict";
var lexing_1 = require('lexing');
/**
Given a name represented by a single string, parse it into first name, middle
name, and last name.
makeName(['Leonardo', 'da', 'Vinci']) -> { first: 'Leonardo', last: 'da Vinci' }
makeName(['Chris', 'Callison-Burch']) -> { first: 'Chris', last: 'Callison-Burch' }
makeName(['Hanna', 'M', 'Wallach']) -> { first: 'Hanna', middle: 'M', last: 'Wallach' }
makeName(['Zhou']) -> { last: 'Zhou' }
makeName(['McCallum', 'Andrew']) -> { first: 'Andrew', last: 'McCallum' }
TODO: handle 'van', 'von', 'da', etc.
*/
function parseName(parts) {
var n = parts.length;
if (n >= 3) {
return {
first: parts[0],
middle: parts.slice(1, n - 1).join(' '),
last: parts[n - 1],
};
}
else if (n == 2) {
return {
first: parts[0],
last: parts[1],
};
}
return {
last: parts[0]
};
}
exports.parseName = parseName;
/**
Opinionated name formatting.
*/
function formatName(name) {
return [name.first, name.middle, name.last].filter(function (part) { return part !== null && part !== undefined; }).join(' ');
}
exports.formatName = formatName;
function formatNames(names) {
var name_strings = names.map(formatName);
if (name_strings.length < 3) {
return name_strings.join(' and ');
}
// use the Oxford comma
var parts = name_strings.slice(0, -2); // might be []
parts.push(name_strings.slice(-2).join(', and '));
return parts.join(', ');
}
exports.formatNames = formatNames;
var default_rules = [
[/^$/, function (match) { return lexing_1.Token('EOF'); }],
[/^\s+/, function (match) { return null; }],
[/^,/, function (match) { return lexing_1.Token('SEPARATOR', match[0]); }],
[/^(and|et|&)/, function (match) { return lexing_1.Token('CONJUNCTION', match[0]); }],
[/^[A-Z](\.|\s)/, function (match) { return lexing_1.Token('INITIAL', match[0].trim()); }],
[/^((van|von|da|de)\s+)?[A-Z][^,\s]+(\s+[IVX]+\b)?/i, function (match) { return lexing_1.Token('NAME', match[0]); }],
// pretty much a catch-all:
[/^[^,\s]+/i, function (match) { return lexing_1.Token('NAME', match[0]); }],
];
/**
1. Typical list of 3+
'David Mimno, Hanna M Wallach, and Andrew McCallum' ->
['David Mimno', 'Hanna M Wallach', 'Andrew McCallum']
2. List of 3+ without the Oxford comma, in case that ever happens
'Aravind K Joshi, Ben King and Steven Abney' ->
['David Mimno', 'Hanna M Wallach', 'Andrew McCallum']
3. Duo
'Daniel Ramage and Chris Callison-Burch' ->
['David Mimno', 'Chris Callison-Burch']
4. Single author
'David Sankofl' ->
['David Sankofl']
5. Et al. abbreviation
'Zhao et al.' ->
['Zhao', 'al.']
TODO: handle last-name-first swaps, e.g.,
'Levy, R., & Daumé III, H.' -> 'R. Levy, H. Daumé III' -> ['R. Levy', 'H. Daumé III']
Or:
'Liu, F., Tian, F., & Zhu, Q.' -> 'F. Liu, F. Tian, & Q. Zhu' -> ['F. Liu', 'F. Tian', 'Q. Zhu']
Technically, this is ambiguous, since we could support lists of only last names
(e.g., 'Liu, Tian'; is this ['Tian Liu'] or ['Liu', 'Tian']?), but heuristics are better than nothing.
Example chunks:
[FIRST MIDDLE LAST] SEP
[FIRST LAST] SEP
[LAST SEP FIRST] SEP
[LAST SEP INITIAL] [LAST2 SEP INITIAL2]
*/
function parseNames(input) {
var input_iterable = new lexing_1.StringIterator(input);
var tokenizer = new lexing_1.Tokenizer(default_rules);
var token_iterator = tokenizer.map(input_iterable);
var names = [];
var buffer = [];
var buffer_swap = false;
function flush() {
if (buffer_swap) {
// move the first item to the last item
buffer.push(buffer.shift());
}
var name = parseName(buffer);
names.push(name);
// reset
buffer = [];
buffer_swap = false;
}
while (1) {
var token = token_iterator.next();
// console.error('%s=%s', token.name, token.value);
// tokens: EOF NAME INITIAL SEPARATOR CONJUNCTION
if (token.name === 'EOF') {
break;
}
else if (token.name === 'NAME') {
// the first long name after
if (buffer.length > 0 && buffer_swap) {
flush();
}
buffer.push(token.value);
}
else if (token.name === 'INITIAL') {
// console.log('INITIAL=%s', token.value);
buffer.push(token.value);
}
else if (token.name === 'SEPARATOR' || token.name === 'CONJUNCTION') {
if (buffer.length === 1) {
buffer_swap = true;
}
else if (buffer.length > 1) {
flush();
}
else {
}
}
}
// finish up
if (buffer.length > 0) {
flush();
}
return names;
}
exports.parseNames = parseNames;
/**
Typically, in-paper citations (`Cite`s) only have the last names of the authors,
while the `Reference`s in the Bibliography have full names, or at least first
initials and last names.
This method determines whether a `Cite`'s names match a `Reference`'s authors.
authorsMatch(['Joshi'], ['Aravind K Joshi']) -> true
authorsMatch(['Diab', 'Kamboj'], ['Mona Diab', 'Ankit Kamboj']) -> true
'et al.' gets special treatment. 'et al.' is a match if and only if there are
more reference authors beyond the one parallel to the 'et al.' citation author.
In other words, 'et al.' cannot stand in for a single author.
authorsMatch(['Blei', 'et al.'], ['David M Blei', 'Andrew Y Ng', 'Michael I Jordan']) -> true
*/
function authorsMatch(citeAuthors, referenceAuthors) {
for (var i = 0, l = Math.max(citeAuthors.length, referenceAuthors.length); i < l; i++) {
var citeAuthor = citeAuthors[i];
var referenceAuthor = referenceAuthors[i];
// the et al. handling has to precede the normal name-checking conditional below
if (citeAuthor && citeAuthor.last === 'al.' && referenceAuthors.length > (i + 1)) {
// early exit: ignore the rest of the reference authors
return true;
}
if (citeAuthor === undefined || referenceAuthor === undefined || citeAuthor.last !== referenceAuthor.last) {
return false;
}
}
return true;
}
exports.authorsMatch = authorsMatch;