taipa
Version:
Taiwanese morphological parsing library
311 lines • 12.8 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.depParse = exports.getDepRelations = exports.getTokens = void 0;
const parser_1 = require("../dparser/parser");
const tagger_1 = require("../dparser/tagger");
const document_1 = require("../document");
const document_2 = require("../document");
const feature_1 = require("./feature");
const rules_1 = require("./rules");
const symbols_1 = require("./symbols");
const dictionary_1 = require("./dictionary");
const lemmatizer_1 = require("../unchange/lemmatizer");
const tonalres_1 = require("../tonal/tonalres");
const tone_1 = require("../tonal/tone");
/**
 * Split a text into word tokens.
 *
 * A token is a maximal run of `\w` characters; everything else acts as a
 * separator and is discarded.
 *
 * @param {string} text The text to tokenize. Falsy input yields no tokens.
 * @returns {string[]} The tokens in order of appearance, or an empty array.
 */
const getTokens = function (text) {
    if (!text) {
        return [];
    }
    const words = text.match(/\w+/g);
    // `match` returns null when nothing matches.
    return words ? words.filter((word) => word != undefined) : [];
};
exports.getTokens = getTokens;
/**
 * Run a fresh `DependencyParser` over the given nodes.
 *
 * @param nodes The nodes handed to the parser's `parse` method.
 * @returns Whatever `DependencyParser#parse` returns for those nodes.
 */
const getDepRelations = function (nodes) {
    return new parser_1.DependencyParser().parse(nodes);
};
exports.getDepRelations = getDepRelations;
/**
 * Compute one feature per token.
 *
 * @param {string[]} tokens The tokens of a sentence.
 * @returns {Array} The result of `getFeature(token, index, tokens)` for each
 *   position, in token order.
 */
function getFeatures(tokens) {
    return Array.from({ length: tokens.length }, (_, position) => (0, feature_1.getFeature)(tokens[position], position, tokens));
}
/**
 * Check if the word is in fourth tone or eighth tone.
 *
 * The word qualifies when it carries no inflectional ending and its
 * allomorphic ending (the final) has length 1 or 2.
 *
 * @param {string} token The word to inspect.
 * @returns {boolean} True when both conditions hold.
 */
function isFourthEighthTone(token) {
    const tone = (0, tone_1.extractTones)(token);
    // Any inflectional ending rules out both tones.
    if (tone.getInflectionalEnding().length !== 0) {
        return false;
    }
    const finalLength = tone.getAllomorphicEnding().length;
    return finalLength === 1 || finalLength === 2;
}
/**
 * Check if the word is in fourth tone.
 *
 * The word qualifies when it carries no inflectional ending and its
 * allomorphic ending (the final) has length 1.
 *
 * @param {string} token The word to inspect.
 * @returns {boolean} True when both conditions hold.
 */
function isFourthTone(token) {
    const tone = (0, tone_1.extractTones)(token);
    const hasNoInflection = tone.getInflectionalEnding().length === 0;
    return hasNoInflection && tone.getAllomorphicEnding().length === 1;
}
/**
 * Check if the word ends in a final followed by the first tone letter.
 *
 * The inflectional ending must be exactly `TonalLetterTags.f` and the
 * allomorphic ending must have length 2.
 *
 * @param {string} token The word to inspect.
 * @returns {boolean} True when both conditions hold.
 */
function isFirstCheckedTone(token) {
    const tone = (0, tone_1.extractTones)(token);
    const inflection = tone.getInflectionalEnding();
    if (inflection.length !== 1 || inflection !== tonalres_1.TonalLetterTags.f) {
        return false;
    }
    return tone.getAllomorphicEnding().length === 2;
}
/**
 * Check if the word ends in the seventh tone letter.
 *
 * The inflectional ending must be exactly `TonalLetterTags.z`.
 *
 * @param {string} token The word to inspect.
 * @returns {boolean} True when the condition holds.
 */
function isSeventhTone(token) {
    const inflection = (0, tone_1.extractTones)(token).getInflectionalEnding();
    return inflection.length === 1 && inflection === tonalres_1.TonalLetterTags.z;
}
/**
 * Check if the word ends in a final followed by the tonal letter `w`.
 *
 * The inflectional ending must be exactly `TonalLetterTags.w` and the
 * allomorphic ending must have length 2.
 *
 * @param {string} token The word to inspect.
 * @returns {boolean} True when both conditions hold.
 */
function isThirdCheckedTone(token) {
    const tone = (0, tone_1.extractTones)(token);
    const inflection = tone.getInflectionalEnding();
    if (inflection.length !== 1 || inflection !== tonalres_1.TonalLetterTags.w) {
        return false;
    }
    return tone.getAllomorphicEnding().length === 2;
}
/**
 * Decide whether the word at `position` inside a multi-word expression has to
 * be uninflected to obtain its lemma.
 *
 * - At the beginning of the expression (the main verb): only when the
 *   expression is separated (`distance > 0`).
 * - At `begin + 1 + distance` (1st particle or 2nd verb): when the second
 *   constituent is a first or third checked tone, never when it is a fourth
 *   tone.
 * - At `end` (2nd particle, if any): when the third constituent is a seventh
 *   tone or a first checked tone.
 *
 * @param expression A multi-word expression.
 * @param position Position of a word in a sentence.
 * @returns {boolean} True when the word should be uninflected.
 */
function shouldUninflect(expression, position) {
    if (position == expression.begin) {
        // main verb
        return expression.distance > 0;
    }
    if (!(position > expression.begin)) {
        return false;
    }
    if (position == expression.begin + 1 + expression.distance) {
        // 1st particle or 2nd verb
        const second = expression.tokens[1];
        return (!isFourthTone(second) &&
            (isFirstCheckedTone(second) || isThirdCheckedTone(second)));
    }
    if (position == expression.end) {
        // 2nd particle, if any
        const third = expression.tokens[2];
        return isSeventhTone(third) || isFirstCheckedTone(third);
    }
    return false;
}
/**
 * Multi-Word Expression.
 *
 * A plain record describing where an expression sits in a sentence and which
 * words it is made of. Every instance starts out zeroed with its own empty
 * `tokens` array.
 */
class MultiWordExpression {
    constructor() {
        /** The beginning of an expression in a sentence. */
        this.begin = 0;
        /** The end of an expression in a sentence. */
        this.end = 0;
        /** How far is the preceding word from the separated following words. */
        this.distance = 0;
        /** The constituents of an expression. */
        this.tokens = [];
    }
}
/**
 * Create a multi-word expression made of two constituents.
 *
 * Only `begin` and `tokens` are filled in; `end` and `distance` keep their
 * defaults and are left for the caller to set.
 *
 * @param {number} begin Position of the first constituent in the sentence.
 * @param {string} token1 The first constituent.
 * @param {string} token2 The second constituent.
 * @returns {MultiWordExpression} The new expression.
 */
function createExpressionLengthTwo(begin, token1, token2) {
    const expression = new MultiWordExpression();
    expression.begin = begin;
    expression.tokens.push(token1, token2);
    return expression;
}
/**
 * Collect the multi-word expressions (phrasal verbs and separated verbs)
 * found in a tagged sentence.
 *
 * Two passes are made over `pairs`:
 * 1. adjacent verb + particle, optionally followed by a 2nd particle;
 * 2. a verb followed by a noun or by a `padv`-tagged word, with the matching
 *    particle (or verb) searched for further right in the sentence.
 *
 * @param pairs Token-tag pairs; `pairs[i][0]` is used as the token and
 *   `pairs[i][1]` is compared against `Tagset` members.
 * @returns An array of `MultiWordExpression` objects. Expressions found by the
 *   second pass are appended after all expressions of the first pass.
 */
function getMultiWordExpressions(pairs) {
const expressions = [];
// 1st pass: the verb and its particle are adjacent
for (let i = 0; i < pairs.length - 1; i++) {
// phrasal verbs as verb + particle
// accepted when both words are in the base lists or both are in the inflected lists
if (pairs[i][1] === symbols_1.Tagset.vb &&
pairs[i + 1][1] === symbols_1.Tagset.ppv &&
((dictionary_1.baseVerbs.includes(pairs[i][0]) &&
dictionary_1.basePhrasalVerbParticles.includes(pairs[i + 1][0])) ||
(rules_1.inflectedVerbs.includes(pairs[i][0]) &&
rules_1.inflectedPhrasalVerbParticles.includes(pairs[i + 1][0])))) {
const expr = createExpressionLengthTwo(i, pairs[i][0], pairs[i + 1][0]);
expr.end = i + 1;
expressions.push(expr);
// look ahead for the 2nd particle of a phrasal verb
if (i + 2 < pairs.length && pairs[i + 2][1] === symbols_1.Tagset.ppv) {
// can further check if the 2nd particle is inflected
// push the 2nd particle into tokens array of the last element of the expressions
expressions[expressions.length - 1].tokens.push(pairs[i + 2][0]);
expressions[expressions.length - 1].end = i + 2;
}
}
}
// 2nd pass: the verb is separated from the rest of the expression
for (let i = 0; i < pairs.length - 2; i++) {
if (pairs[i][1] === symbols_1.Tagset.vb && pairs[i + 1][1] === symbols_1.Tagset.nn) {
// candidates sharing the verb at i, one per particle found to its right
const exprs = [];
// separable transitive phrasal verb
// search in the remaining tokens. no dictionary lookup is done in this branch
for (let k = i + 2; k < pairs.length; k++) {
if (pairs[k][1] === symbols_1.Tagset.ppv) {
const expr = createExpressionLengthTwo(i, pairs[i][0], pairs[k][0]);
expr.end = k;
exprs.push(expr);
// in the case of length 2, there will be 1 expression in exprs.
// in the case of length 3, there will be 2 expressions in exprs.
}
// look ahead until the end of the sentence
// the candidates are only resolved on the last iteration (k is the last index)
// when more than 2 candidates were collected, nothing is pushed
if (k + 1 == pairs.length && exprs.length == 1) {
// phrasal verb of length 2
const popped = exprs.shift();
if (popped) {
// NOTE(review): distance is fixed at 1 regardless of how far k is from i — confirm
popped.distance = 1;
expressions.push(popped);
}
}
else if (k + 1 == pairs.length && exprs.length == 2) {
// the 2nd particle is popped out from the tokens array of the 2nd expression
const secondParticle = exprs[1].tokens.pop();
// phrasal verb of length 3
// main verb and 1st particle already in the array
// we then push the 2nd particle
if (secondParticle)
exprs[0].tokens.push(secondParticle);
// get the end of the 2nd expression
const end2nd = exprs[1].end;
const popped = exprs.shift();
if (popped) {
popped.distance = 1;
// assign the end
popped.end = end2nd;
expressions.push(popped);
}
}
}
}
else if (pairs[i][1] === symbols_1.Tagset.vb && pairs[i + 1][1] === symbols_1.Tagset.padv) {
// every verb or particle to the right yields its own expression
// NOTE(review): `end` is not assigned in this branch and keeps its default 0 — confirm this is intended
for (let k = i + 2; k < pairs.length; k++) {
// look ahead
if (pairs[k][1] === symbols_1.Tagset.vb) {
// separate verb
const expr = createExpressionLengthTwo(i, pairs[i][0], pairs[k][0]);
expr.distance = 1;
expressions.push(expr);
}
else if (pairs[k][1] === symbols_1.Tagset.ppv) {
// separable phrasal verb
const expr = createExpressionLengthTwo(i, pairs[i][0], pairs[k][0]);
expr.distance = 1;
expressions.push(expr);
}
}
}
}
// NOTE(review): the result is not sorted by `begin`; 2nd-pass expressions always follow 1st-pass ones
return expressions;
}
/**
 * Produce one lemma per token-tag pair.
 *
 * Tokens outside of any multi-word expression get an empty lemma. Inside an
 * expression the lemma depends on the tag:
 * - the verb at the beginning is lemmatized, unless it is in fourth or eighth
 *   tone, in which case the lemma is empty;
 * - a `padv` word is lemmatized;
 * - a noun keeps its surface form;
 * - a particle is lemmatized only when `shouldUninflect` says so;
 * - anything else gets an empty lemma.
 *
 * The returned array always has exactly `pairs.length` entries, so
 * `lemmas[i]` belongs to `pairs[i]`.
 *
 * @param pairs Token-tag pairs; `pairs[i][0]` is the token, `pairs[i][1]` the tag.
 * @param expressions Multi-word expressions, consumed front to back.
 * @returns {string[]} The lemmas, index-aligned with `pairs`.
 */
function getLemmas(pairs, expressions) {
    const { vb, nn, padv, ppv } = symbols_1.Tagset;
    const firstLemmaOf = (lexeme) => lexeme.getLemmas()[0].literal;
    const lemmas = [];
    // Index of the expression currently being matched against the sentence.
    // NOTE(review): the cursor only moves forward when the last position of the
    // current expression is reached, so it relies on `expressions` being
    // ordered and non-overlapping.
    let ind = 0;
    for (let i = 0; i < pairs.length; i++) {
        const current = expressions[ind];
        const spanEnd = current
            ? current.begin + current.distance + current.tokens.length
            : 0;
        if (!current || !(i >= current.begin && i < spanEnd)) {
            // not part of a multi-word expression
            lemmas.push('');
            continue;
        }
        const token = pairs[i][0];
        const tag = pairs[i][1];
        // Defaults to the empty lemma so that every pair contributes exactly
        // one entry, whatever its tag is.
        let lemma = '';
        if (i == current.begin) {
            // the begin of a multi-word expression
            if (tag === vb && !isFourthEighthTone(token)) {
                lemma = firstLemmaOf((0, lemmatizer_1.lemmatize)(token));
            }
        }
        else {
            // in the middle of a multi-word expression
            if (tag === padv) {
                lemma = firstLemmaOf((0, lemmatizer_1.lemmatize)(token));
            }
            else if (tag === nn) {
                lemma = token;
            }
            else if (tag === ppv && shouldUninflect(current, i)) {
                // if tiurhw, laiz, khihf, etc. 1, 3, 7 to 4.
                const isCheckedOrSeventh = isThirdCheckedTone(token) ||
                    isFirstCheckedTone(token) ||
                    isSeventhTone(token);
                lemma = firstLemmaOf(isCheckedOrSeventh
                    ? (0, lemmatizer_1.lemmatizePhrasalVerbParticle)(token)
                    : (0, lemmatizer_1.lemmatize)(token));
            }
            if (i + 1 == spanEnd) {
                // move indicator to the next expression
                ind++;
            }
        }
        lemmas.push(lemma);
    }
    return lemmas;
}
/**
 * Convert token-tag pairs to nodes which are used as stack or queue elements.
 *
 * @param pairs Token-tag pairs; `pairs[i][0]` is the token, `pairs[i][1]` the
 *   tag. A missing (null/undefined) value yields an empty array.
 * @param {string[]} lemmas Lemmas, index-aligned with `pairs`.
 * @returns An array with one `Node` per pair, carrying the pair's token, its
 *   tag and the lemma found at the same index.
 */
function convertTokensToNodes(pairs, lemmas) {
    // Guard before the first dereference of `pairs`; checking afterwards
    // would be too late to prevent a TypeError.
    if (!pairs) {
        return [];
    }
    return pairs.map((pair, index) => {
        const node = new document_2.Node(pair[0]);
        node.tag = pair[1];
        node.lemma = lemmas[index];
        return node;
    });
}
/**
 * Parse a text into a document of nodes and dependency relations.
 *
 * The text is tokenized, turned into features, tagged, scanned for
 * multi-word expressions and lemmatized; the resulting nodes are then handed
 * to the dependency parser.
 *
 * @param {string} text The text to parse.
 * @returns A `Document` whose `nodes` and `relations` are filled in.
 */
function depParse(text) {
    const features = getFeatures((0, exports.getTokens)(text));
    const taggedPairs = (0, tagger_1.tag)(features);
    const lemmas = getLemmas(taggedPairs, getMultiWordExpressions(taggedPairs));
    const nodes = convertTokensToNodes(taggedPairs, lemmas);
    const relations = (0, exports.getDepRelations)(nodes);
    return Object.assign(new document_1.Document(), { nodes, relations });
}
exports.depParse = depParse;
//# sourceMappingURL=processor.js.map