UNPKG

academia

Version:

Tools for analyzing academic text

136 lines (133 loc) 5.41 kB
"use strict"; var types = require('../types'); var names = require('../names'); var name = '[A-Z][^()\\s]+(?: [IV]+)?'; var year = '[0-9]{4}(?:[-–—][0-9]{4})?[a-z]?'; var citeSources = [ // et al., duo, and single, with year in parens (name + "\\s+et\\s+al.\\s+\\(" + year + "\\)"), (name + "\\s+(?:and|&)\\s+" + name + "\\s+\\(" + year + "\\)"), (name + "\\s+\\(" + year + "\\)"), // et al., duo, and single, with year not in parens (note the commas) (name + "\\s+et\\s+al.,\\s+" + year + "\\b"), (name + "\\s+(?:and|&)\\s+" + name + ",\\s+" + year + "\\b"), (name + ",\\s+" + year + "\\b"), ]; exports.citeRegExp = new RegExp(citeSources.join('|'), 'g'); exports.yearRegExp = new RegExp(year); var citeCleanRegExp = new RegExp("[(),]|" + year, 'g'); /** find the start indices and lengths of all non-overlapping substrings matching `regExp` in `input`. */ function matchSpans(input, regExp) { if (regExp === void 0) { regExp = exports.citeRegExp; } // reset the regex regExp.lastIndex = 0; // set up the iteration variables var previousLastIndex = regExp.lastIndex; var spans = []; var match; while ((match = regExp.exec(input)) !== null) { spans.push([match.index, match[0].length]); } return spans; } exports.referenceRegExp = new RegExp("^(.+?)[.,]?\\s*\\(?(" + year + ")\\)?\\.\\s*(.+?)\\."); /** Given a string representing an individual reference in a bibliography, parse it into a Reference structure. */ function parseReference(reference) { var match = reference.match(exports.referenceRegExp); var authors = match ? names.parseNames(match[1]) : []; return { authors: authors, year: match ? match[2] : undefined, title: match ? match[3] : undefined, source: reference, }; } exports.parseReference = parseReference; /** Given a Reference, format it as a string. */ function formatReference(reference) { var authors = names.formatNames(reference.authors); var parts = [authors, reference.year, reference.title, reference.venue, reference.publisher, reference.pages]; return parts.filter(function (part) { return part !== undefined && part !== null; }).join('. ') + '.'; } exports.formatReference = formatReference; /** In-place modifies `cites` by setting the `reference` value of each one where a unique match from `references` is found. TODO: handle multiple matches somehow. */ function linkCites(cites, references) { cites.forEach(function (cite) { cite.references = references .map(function (reference, reference_i) { return ({ reference: reference, reference_i: reference_i }); }) .filter(function (_a) { var reference = _a.reference, reference_i = _a.reference_i; return names.authorsMatch(cite.authors, reference.authors) && (cite.year == reference.year); }) .map(function (_a) { var reference = _a.reference, reference_i = _a.reference_i; return ("/references/" + reference_i); }); }); } exports.linkCites = linkCites; /** Given the text of some part of a paper, extract the `Cite`s using regular expressions. */ function findCites(input, pointer) { return matchSpans(input, exports.citeRegExp).map(function (_a) { var offset = _a[0], length = _a[1]; var text = input.slice(offset, offset + length); var year_match = text.match(exports.yearRegExp); // we cull it down to just the names by removing parentheses, commas, // and years (with optional suffixes), and trimming any extra whitespace var names_string = text.replace(citeCleanRegExp, '').trim(); return { style: types.CiteStyle.Textual, text: text, origin: { pointer: pointer, offset: offset, length: length, }, authors: names.parseNames(names_string), year: year_match ? year_match[0] : null, references: [], }; }); } exports.findCites = findCites; /** Join the papers sections into a single string, for searching, and find all cites in that string. Parse references, and link the cites to them heuristically. Extend the given paper with the parsed references and cites (linked or not), and return it. */ function linkPaper(paper, referencesTitleRegExp) { if (referencesTitleRegExp === void 0) { referencesTitleRegExp = /References?/; } var sections = paper.sections; var body_sections = sections.filter(function (section) { return !referencesTitleRegExp.test(section.title); }); var references = sections .filter(function (section) { return referencesTitleRegExp.test(section.title); }) .map(function (section) { return section.paragraphs.map(parseReference); }) .reduce(function (accumulator, references) { accumulator.push.apply(accumulator, references); return accumulator; }, []); var cites = []; body_sections.forEach(function (section, section_i) { section.paragraphs.forEach(function (paragraph, paragraph_i) { cites.push.apply(cites, findCites(paragraph, "/sections/" + section_i + "/paragraphs/" + paragraph_i)); }); }); linkCites(cites, references); return { sections: sections, references: references, cites: cites }; } exports.linkPaper = linkPaper;