UNPKG

taipa

Version:

Taiwanese morphological parsing library

311 lines 12.8 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.depParse = exports.getDepRelations = exports.getTokens = void 0;
const parser_1 = require("../dparser/parser");
const tagger_1 = require("../dparser/tagger");
const document_1 = require("../document");
const document_2 = require("../document");
const feature_1 = require("./feature");
const rules_1 = require("./rules");
const symbols_1 = require("./symbols");
const dictionary_1 = require("./dictionary");
const lemmatizer_1 = require("../unchange/lemmatizer");
const tonalres_1 = require("../tonal/tonalres");
const tone_1 = require("../tonal/tone");

/**
 * Split a text into tokens, where a token is a maximal run of `\w` characters.
 * @param text The text to tokenize. A falsy value yields no tokens.
 * @returns The tokens in order of appearance; an empty array when nothing matches.
 */
const getTokens = function (text) {
    if (!text) {
        return [];
    }
    // String.prototype.match returns null (not an empty array) when nothing matches.
    return text.match(/\w+/g) ?? [];
};
exports.getTokens = getTokens;

/**
 * Run a fresh DependencyParser over the given nodes.
 * @param nodes The nodes to parse.
 * @returns Whatever `DependencyParser.parse` returns for the nodes.
 */
const getDepRelations = function (nodes) {
    const pa = new parser_1.DependencyParser();
    return pa.parse(nodes);
};
exports.getDepRelations = getDepRelations;

/**
 * Build one feature per token by calling `getFeature(token, index, tokens)`.
 * @param tokens The tokens of a sentence.
 * @returns The features, index-aligned with `tokens`.
 */
function getFeatures(tokens) {
    const features = [];
    for (let i = 0; i < tokens.length; i++) {
        features.push((0, feature_1.getFeature)(tokens[i], i, tokens));
    }
    return features;
}

/** Check if the word is in fourth tone or eighth tone. */
function isFourthEighthTone(token) {
    const tone = (0, tone_1.extractTones)(token);
    // no inflectional endings, not first tone which has no inflectional ending
    // the fourth or eighth tone has a final
    const finalLength = tone.getAllomorphicEnding().length;
    return tone.getInflectionalEnding().length === 0 && (finalLength === 1 || finalLength === 2);
}

/** Check if the word is in fourth tone. */
function isFourthTone(token) {
    const tone = (0, tone_1.extractTones)(token);
    // no inflectional endings, not first tone which has no inflectional ending
    // the fourth tone has a final of length 1
    return tone.getInflectionalEnding().length === 0 && tone.getAllomorphicEnding().length === 1;
}

/** Check if the word ends in a final followed by the first tone letter. */
function isFirstCheckedTone(token) {
    const tone = (0, tone_1.extractTones)(token);
    // a final plus a first tone letter
    return (tone.getInflectionalEnding().length === 1 &&
        tone.getInflectionalEnding() === tonalres_1.TonalLetterTags.f &&
        tone.getAllomorphicEnding().length === 2);
}

/** Check if the word ends in the seventh tone letter. */
function isSeventhTone(token) {
    const tone = (0, tone_1.extractTones)(token);
    // a seventh tone letter
    return (tone.getInflectionalEnding().length === 1 &&
        tone.getInflectionalEnding() === tonalres_1.TonalLetterTags.z);
}

/** Check if the word ends in a final followed by the third tone letter. */
function isThirdCheckedTone(token) {
    const tone = (0, tone_1.extractTones)(token);
    // a final plus a third tone letter
    return (tone.getInflectionalEnding().length === 1 &&
        tone.getInflectionalEnding() === tonalres_1.TonalLetterTags.w &&
        tone.getAllomorphicEnding().length === 2);
}

/**
 * Given a multi-word expression, decide whether the word at a position
 * should be uninflected to get its lemma.
 * @param expression A multi-word expression
 * @param position Position of a word in a sentence.
 * @returns true when the word at `position` should be uninflected.
 */
function shouldUninflect(expression, position) {
    if (position === expression.begin) {
        // main verb
        return expression.distance > 0;
    }
    if (position < expression.begin) {
        return false;
    }
    if (position === expression.begin + 1 + expression.distance) {
        // 1st particle or 2nd verb
        const token = expression.tokens[1];
        if (isFourthTone(token)) {
            return false;
        }
        return isFirstCheckedTone(token) || isThirdCheckedTone(token);
    }
    if (position === expression.end) {
        // 2nd particle, if any
        const token = expression.tokens[2];
        return isSeventhTone(token) || isFirstCheckedTone(token);
    }
    return false;
}

/** Multi-Word Expression. */
class MultiWordExpression {
    /** The beginning of an expression in a sentence. */
    begin = 0;
    /** The end of an expression in a sentence. */
    end = 0;
    /** How far is the preceding word from the separated following words. */
    distance = 0;
    /** The constituents of an expression. */
    tokens = [];
}

/**
 * Create a two-token expression starting at `begin`.
 * `end` and `distance` keep their defaults (0) and are set by the caller.
 */
function createExpressionLengthTwo(begin, token1, token2) {
    const obj = new MultiWordExpression();
    obj.begin = begin;
    obj.tokens.push(token1);
    obj.tokens.push(token2);
    return obj;
}

/**
 * Collect multi-word expressions from token-tag pairs.
 *
 * The first pass finds adjacent verb + particle pairs that are listed either
 * in the base or in the inflected verb/particle tables, optionally followed by
 * a second particle. The second pass finds verbs separated from their
 * particle (or second verb) by a noun or by a `padv`-tagged word.
 * @param pairs Token-tag pairs; `pair[0]` is the token, `pair[1]` the tag.
 * @returns The expressions found, first-pass results before second-pass results.
 */
function getMultiWordExpressions(pairs) {
    const expressions = [];
    for (let i = 0; i < pairs.length - 1; i++) {
        const verb = pairs[i][0];
        const particle = pairs[i + 1][0];
        // phrasal verbs as verb + particle
        if (pairs[i][1] === symbols_1.Tagset.vb &&
            pairs[i + 1][1] === symbols_1.Tagset.ppv &&
            ((dictionary_1.baseVerbs.includes(verb) &&
                dictionary_1.basePhrasalVerbParticles.includes(particle)) ||
                (rules_1.inflectedVerbs.includes(verb) &&
                    rules_1.inflectedPhrasalVerbParticles.includes(particle)))) {
            const expr = createExpressionLengthTwo(i, verb, particle);
            expr.end = i + 1;
            expressions.push(expr);
            // look ahead for the 2nd particle of a phrasal verb
            if (i + 2 < pairs.length && pairs[i + 2][1] === symbols_1.Tagset.ppv) {
                // can further check if the 2nd particle is inflected
                expr.tokens.push(pairs[i + 2][0]);
                expr.end = i + 2;
            }
        }
    }
    for (let i = 0; i < pairs.length - 2; i++) {
        if (pairs[i][1] !== symbols_1.Tagset.vb) {
            continue;
        }
        if (pairs[i + 1][1] === symbols_1.Tagset.nn) {
            // separable transitive phrasal verb
            // search in the remaining tokens for particles
            const exprs = [];
            for (let k = i + 2; k < pairs.length; k++) {
                if (pairs[k][1] === symbols_1.Tagset.ppv) {
                    const expr = createExpressionLengthTwo(i, pairs[i][0], pairs[k][0]);
                    expr.end = k;
                    exprs.push(expr);
                }
            }
            // Decide once the whole rest of the sentence has been scanned.
            // One particle found: phrasal verb of length 2.
            // Two particles found: phrasal verb of length 3.
            // Any other count contributes nothing.
            if (exprs.length === 1) {
                const found = exprs[0];
                found.distance = 1;
                expressions.push(found);
            }
            else if (exprs.length === 2) {
                const found = exprs[0];
                // main verb and 1st particle are already in found.tokens;
                // move the 2nd particle over from the 2nd candidate
                const secondParticle = exprs[1].tokens.pop();
                if (secondParticle) {
                    found.tokens.push(secondParticle);
                }
                found.distance = 1;
                found.end = exprs[1].end;
                expressions.push(found);
            }
        }
        else if (pairs[i + 1][1] === symbols_1.Tagset.padv) {
            // look ahead: every later verb (separate verb) or particle
            // (separable phrasal verb) forms an expression with this verb
            for (let k = i + 2; k < pairs.length; k++) {
                const tag = pairs[k][1];
                if (tag === symbols_1.Tagset.vb || tag === symbols_1.Tagset.ppv) {
                    const expr = createExpressionLengthTwo(i, pairs[i][0], pairs[k][0]);
                    expr.distance = 1;
                    expressions.push(expr);
                }
            }
        }
    }
    return expressions;
}

/**
 * Produce one lemma per token-tag pair.
 *
 * Tokens outside the current expression get an empty lemma. Inside an
 * expression the lemma depends on the tag and, for particles, on
 * `shouldUninflect`. Expressions are consumed in array order: the cursor
 * advances when the last position of the current expression's span is passed.
 * @param pairs Token-tag pairs; `pair[0]` is the token, `pair[1]` the tag.
 * @param expressions The multi-word expressions of the sentence.
 * @returns Lemmas, always index-aligned with `pairs` ('' where there is none).
 */
function getLemmas(pairs, expressions) {
    const lemmas = [];
    let ind = 0;
    for (let i = 0; i < pairs.length; i++) {
        const expression = expressions[ind];
        const spanEnd = expression
            ? expression.begin + expression.distance + expression.tokens.length
            : 0;
        if (!expression || i < expression.begin || i >= spanEnd) {
            lemmas.push('');
            continue;
        }
        const token = pairs[i][0];
        const tag = pairs[i][1];
        // Default to '' so that every token contributes exactly one lemma,
        // whatever its tag; the result is consumed by token index.
        let lemma = '';
        if (i === expression.begin) {
            // the begin of a multi-word expression
            if (tag === symbols_1.Tagset.vb && !isFourthEighthTone(token)) {
                // to match tone patterns
                lemma = (0, lemmatizer_1.lemmatize)(token).getLemmas()[0].literal;
            }
        }
        else {
            // in the middle of a multi-word expression
            if (tag === symbols_1.Tagset.padv) {
                lemma = (0, lemmatizer_1.lemmatize)(token).getLemmas()[0].literal;
            }
            else if (tag === symbols_1.Tagset.nn) {
                lemma = token;
            }
            else if (tag === symbols_1.Tagset.ppv && shouldUninflect(expression, i)) {
                if (isThirdCheckedTone(token) || isFirstCheckedTone(token) || isSeventhTone(token)) {
                    // if tiurhw, laiz, khihf, etc. 1, 3, 7 to 4.
                    lemma = (0, lemmatizer_1.lemmatizePhrasalVerbParticle)(token).getLemmas()[0].literal;
                }
                else {
                    lemma = (0, lemmatizer_1.lemmatize)(token).getLemmas()[0].literal;
                }
            }
            if (i + 1 === spanEnd) {
                // move indicator to the next expression
                ind++;
            }
        }
        lemmas.push(lemma);
    }
    return lemmas;
}

/**
 * Convert token-tag pairs to nodes which are used as stack or queue elements.
 * @param pairs Token-tag pairs; `pair[0]` is the token, `pair[1]` the tag.
 * @param lemmas Lemmas index-aligned with `pairs`.
 * @returns One Node per pair with `tag` and `lemma` assigned.
 */
function convertTokensToNodes(pairs, lemmas) {
    return pairs.map((pair, i) => {
        const node = new document_2.Node(pair[0]);
        node.tag = pair[1];
        node.lemma = lemmas[i];
        return node;
    });
}

/**
 * Tokenize, tag, lemmatize and dependency-parse a text.
 * @param text The text to parse.
 * @returns A Document whose `nodes` and `relations` are filled in.
 */
function depParse(text) {
    const tokens = (0, exports.getTokens)(text);
    const features = getFeatures(tokens);
    const pairsTokenTag = (0, tagger_1.tag)(features);
    const expressions = getMultiWordExpressions(pairsTokenTag);
    const lemmas = getLemmas(pairsTokenTag, expressions);
    const nodes = convertTokensToNodes(pairsTokenTag, lemmas);
    const relations = (0, exports.getDepRelations)(nodes);
    const doc = new document_1.Document();
    doc.nodes = nodes;
    doc.relations = relations;
    return doc;
}
exports.depParse = depParse;
//# sourceMappingURL=processor.js.map