UNPKG

taipa

Version:

Taiwanese morphological parsing library

173 lines 8.42 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.tag = void 0; const rules_1 = require("./rules"); const symbols_1 = require("./symbols"); const dictionary_1 = require("./dictionary"); const lemmatizer_1 = require("../unchange/lemmatizer"); function tag(features) { // const pairs: Pairs<string, string> = []; const pairs = []; let expecting = ''; for (let i = 0; i < features.length; i++) { if (rules_1.proceedingAdverbialParticles.long === features[i].token && dictionary_1.baseVerbs.includes(features[i].nextToken) && dictionary_1.subsidiariesA.includes(features[i].nextToken2)) { pairs.push([features[i].token, symbols_1.Tagset.padv]); continue; } if (rules_1.inflectedAdverbialParticles.includes(features[i].token)) { pairs.push([features[i].token, symbols_1.Tagset.padv]); continue; } if (rules_1.inflectedPhrasalVerbParticles.includes(features[i].token)) { pairs.push([features[i].token, symbols_1.Tagset.ppv]); continue; } if (dictionary_1.baseVerbs.includes(features[i].token) && pairs.length == 1 && rules_1.inflectedAdverbialParticles.includes(pairs[pairs.length - 1][0]) && dictionary_1.subsidiariesA.includes(features[i].nextToken)) { pairs.push([features[i].token, symbols_1.Tagset.vb]); continue; } if (dictionary_1.baseVerbs.includes(features[i].token) && dictionary_1.auxiliaries.includes(features[i].prevToken)) { pairs.push([features[i].token, symbols_1.Tagset.vb]); continue; } if (dictionary_1.baseVerbs.includes(features[i].token) && dictionary_1.subsidiariesA.includes(features[i].nextToken)) { pairs.push([features[i].token, symbols_1.Tagset.vb]); continue; } if (dictionary_1.baseVerbs.includes(features[i].token) && dictionary_1.basePhrasalVerbParticles.includes(features[i].nextToken)) { pairs.push([features[i].token, symbols_1.Tagset.vb]); continue; } if (dictionary_1.baseVerbs.includes(features[i].token) && rules_1.inflectedPhrasalVerbParticles.includes(features[i].prevToken) && rules_1.inflectedPhrasalVerbParticles.includes(features[i].prevToken2)) { // a verb after a preceding phrasal verb pairs.push([features[i].token, symbols_1.Tagset.vb]); continue; } if ((0, lemmatizer_1.lemmatize)(features[i].token).getLemmas().length == 3 && dictionary_1.baseVerbs.includes((0, lemmatizer_1.lemmatize)(features[i].token).getLemmas()[0].literal) && !dictionary_1.basePhrasalVerbParticles.includes(features[i].nextToken) && // object of the verb (dictionary_1.basePhrasalVerbParticles.includes(features[i].nextToken2) || rules_1.inflectedPhrasalVerbParticles.includes(features[i].nextToken2))) { pairs.push([features[i].token, symbols_1.Tagset.vb]); continue; } if ((dictionary_1.basePhrasalVerbParticles.includes(features[i].token) || rules_1.inflectedPhrasalVerbParticles.includes(features[i].token)) && features[i].prevToken && pairs[pairs.length - 1][1] === symbols_1.Tagset.nn && features[i].prevToken2 && pairs[pairs.length - 2][1] === symbols_1.Tagset.vb) { // we may also check if the next token is a phrasal verb particle pairs.push([features[i].token, symbols_1.Tagset.ppv]); continue; } if (dictionary_1.basePhrasalVerbParticles.includes(features[i].token) && features[i].prevToken && pairs[pairs.length - 1][1] === symbols_1.Tagset.ppv && features[i].prevToken2 && pairs[pairs.length - 2][1] === symbols_1.Tagset.nn) { pairs.push([features[i].token, symbols_1.Tagset.ppv]); continue; } if (features[i].nextToken && (0, rules_1.isPhrasalVerbVp)(features[i].token, features[i].nextToken)) { // is main verb of phrasal verb pairs.push([features[i].token, symbols_1.Tagset.vb]); continue; } if (rules_1.inflectedPhrasalVerbParticles.includes(features[i].token) && rules_1.inflectedPhrasalVerbParticles.includes(features[i].prevToken)) { pairs.push([features[i].token, symbols_1.Tagset.ppv]); continue; } if (dictionary_1.subsidiariesA.includes(features[i].token) && dictionary_1.baseVerbs.includes(features[i].prevToken)) { // to check the tone pattern. to check if last word pairs.push([features[i].token, symbols_1.Tagset.psub]); continue; } if (dictionary_1.subsidiariesA.includes(features[i].token) && dictionary_1.basePhrasalVerbParticles.includes(features[i].prevToken) && dictionary_1.baseVerbs.includes(features[i].prevToken2)) { // to check the tone pattern. to check if last word pairs.push([features[i].token, symbols_1.Tagset.psub]); continue; } if (rules_1.inflectedPersonalPronouns.includes(features[i].token)) { pairs.push([features[i].token, symbols_1.Tagset.npr]); continue; } if (features[i].prevToken && (0, rules_1.isPhrasalVerbVp)(features[i].prevToken, features[i].token)) { // is a particle of phrasal verb pairs.push([features[i].token, symbols_1.Tagset.ppv]); continue; } if (features[i].prevToken && features[i].prevToken2 && (0, rules_1.isPhrasalVerbVpp)(features[i].prevToken2, features[i].prevToken, features[i].token)) { // is the 2nd particle of phrasal verb pairs.push([features[i].token, symbols_1.Tagset.ppv]); continue; } if (rules_1.inflectedVerbs.includes(features[i].token) && features[i].nextToken && rules_1.inflectedPhrasalVerbParticles.includes(features[i].nextToken)) { pairs.push([features[i].token, symbols_1.Tagset.vb]); continue; } if (rules_1.inflectedPhrasalVerbParticles.includes(features[i].token) && features[i].nextToken && rules_1.inflectedPhrasalVerbParticles.includes(features[i].nextToken)) { pairs.push([features[i].token, symbols_1.Tagset.ppv]); continue; } if (rules_1.inflectedVerbs.includes(features[i].token) && features[i].nextToken && rules_1.inflectedPhrasalVerbParticles.includes(features[i].nextToken2)) { pairs.push([features[i].token, symbols_1.Tagset.vb]); continue; } if (rules_1.inflectedPhrasalVerbParticles.includes(features[i].token) && features[i].prevToken && rules_1.inflectedVerbs.includes(features[i].prevToken)) { pairs.push([features[i].token, symbols_1.Tagset.ppv]); continue; } if (dictionary_1.demonstrativePronouns.includes(features[i].token)) { pairs.push([features[i].token, symbols_1.Tagset.npr]); continue; } if (dictionary_1.auxiliaries.includes(features[i].token)) { pairs.push([features[i].token, symbols_1.Tagset.aux]); continue; } if (dictionary_1.seperateVVCompounds.map(it => it[0]).includes(features[i].token)) { // the first word of a VV compound expecting = dictionary_1.seperateVVCompounds.filter(it => it[0] === features[i].token)[0][1]; pairs.push([features[i].token, symbols_1.Tagset.vb]); continue; } if (dictionary_1.seperateVVCompounds.map(it => it[1]).includes(expecting) && features[i].token === expecting) { // the second word of a VV compound expecting = ''; pairs.push([features[i].token, symbols_1.Tagset.vb]); continue; } pairs.push([features[i].token, symbols_1.Tagset.nn]); } return pairs; } exports.tag = tag; //# sourceMappingURL=tagger.js.map