// taipa
// Version:
// Taiwanese morphological parsing library
// 173 lines • 8.42 kB
// JavaScript
"use strict";
// Module prelude: define `__esModule` = true on the exports object and
// pre-declare the `tag` export as undefined; the function itself is assigned
// to `exports.tag` at the end of this file.
Object.defineProperty(exports, "__esModule", { value: true });
exports.tag = void 0;
// rules_1 supplies the inflected-form lists (adverbial particles, phrasal verb
// particles, personal pronouns, verbs), `proceedingAdverbialParticles`, and the
// `isPhrasalVerbVp` / `isPhrasalVerbVpp` predicates used by tag().
const rules_1 = require("./rules");
// symbols_1 supplies `Tagset`, whose members are the tags tag() emits.
const symbols_1 = require("./symbols");
// dictionary_1 supplies the base-form lists (verbs, subsidiaries, auxiliaries,
// phrasal verb particles, demonstrative pronouns) and `seperateVVCompounds`.
const dictionary_1 = require("./dictionary");
// lemmatizer_1 supplies `lemmatize`, used by tag() to inspect a token's lemmas.
const lemmatizer_1 = require("../unchange/lemmatizer");
/**
 * Tags every token described by `features` with a member of
 * `symbols_1.Tagset`.
 *
 * For each feature the rules below are tried top to bottom and the first one
 * that matches decides the tag; a token that no rule claims receives
 * `Tagset.nn`. The order of the rules is significant.
 *
 * Rules that look at previously emitted tags only fire when that many tokens
 * have actually been tagged in this call, so a feature list whose first
 * entries already carry `prevToken` / `prevToken2` context is handled without
 * throwing.
 *
 * @param {Array<Object>} features - One entry per token, in order. Fields
 *   read here: `token`, `prevToken`, `prevToken2`, `nextToken`, `nextToken2`.
 * @returns {Array<Array>} `[token, tag]` pairs, one per feature, in input
 *   order.
 */
function tag(features) {
    const Tagset = symbols_1.Tagset;
    const pairs = [];
    // Second word of a separable VV compound that is still awaited
    // ('' when none is pending).
    let expecting = '';
    for (let i = 0; i < features.length; i++) {
        const { token, prevToken, prevToken2, nextToken, nextToken2 } = features[i];
        // True when the two previous tokens of this call have been tagged.
        const hasTwoTagged = pairs.length >= 2;

        // --- adverbial particles -------------------------------------------
        if (rules_1.proceedingAdverbialParticles.long === token &&
            dictionary_1.baseVerbs.includes(nextToken) &&
            dictionary_1.subsidiariesA.includes(nextToken2)) {
            pairs.push([token, Tagset.padv]);
            continue;
        }
        if (rules_1.inflectedAdverbialParticles.includes(token)) {
            pairs.push([token, Tagset.padv]);
            continue;
        }

        // --- inflected phrasal verb particles ------------------------------
        if (rules_1.inflectedPhrasalVerbParticles.includes(token)) {
            pairs.push([token, Tagset.ppv]);
            continue;
        }

        // --- base verbs, decided by their neighbours -----------------------
        if (dictionary_1.baseVerbs.includes(token) &&
            pairs.length === 1 &&
            rules_1.inflectedAdverbialParticles.includes(pairs[0][0]) &&
            dictionary_1.subsidiariesA.includes(nextToken)) {
            // second token, right after an inflected adverbial particle
            pairs.push([token, Tagset.vb]);
            continue;
        }
        if (dictionary_1.baseVerbs.includes(token) &&
            dictionary_1.auxiliaries.includes(prevToken)) {
            pairs.push([token, Tagset.vb]);
            continue;
        }
        if (dictionary_1.baseVerbs.includes(token) &&
            dictionary_1.subsidiariesA.includes(nextToken)) {
            pairs.push([token, Tagset.vb]);
            continue;
        }
        if (dictionary_1.baseVerbs.includes(token) &&
            dictionary_1.basePhrasalVerbParticles.includes(nextToken)) {
            pairs.push([token, Tagset.vb]);
            continue;
        }
        if (dictionary_1.baseVerbs.includes(token) &&
            rules_1.inflectedPhrasalVerbParticles.includes(prevToken) &&
            rules_1.inflectedPhrasalVerbParticles.includes(prevToken2)) {
            // a verb after a preceding phrasal verb
            pairs.push([token, Tagset.vb]);
            continue;
        }

        // --- verb + object + particle --------------------------------------
        // Lemmatize once and reuse the result for both checks.
        const lemmas = (0, lemmatizer_1.lemmatize)(token).getLemmas();
        if (lemmas.length === 3 &&
            dictionary_1.baseVerbs.includes(lemmas[0].literal) &&
            !dictionary_1.basePhrasalVerbParticles.includes(nextToken) && // object of the verb
            (dictionary_1.basePhrasalVerbParticles.includes(nextToken2) ||
                rules_1.inflectedPhrasalVerbParticles.includes(nextToken2))) {
            pairs.push([token, Tagset.vb]);
            continue;
        }
        if ((dictionary_1.basePhrasalVerbParticles.includes(token) ||
            rules_1.inflectedPhrasalVerbParticles.includes(token)) &&
            prevToken &&
            prevToken2 &&
            hasTwoTagged &&
            pairs[pairs.length - 1][1] === Tagset.nn &&
            pairs[pairs.length - 2][1] === Tagset.vb) {
            // particle after a vb + nn sequence
            // we may also check if the next token is a phrasal verb particle
            pairs.push([token, Tagset.ppv]);
            continue;
        }
        if (dictionary_1.basePhrasalVerbParticles.includes(token) &&
            prevToken &&
            prevToken2 &&
            hasTwoTagged &&
            pairs[pairs.length - 1][1] === Tagset.ppv &&
            pairs[pairs.length - 2][1] === Tagset.nn) {
            // base particle after an nn + ppv sequence
            pairs.push([token, Tagset.ppv]);
            continue;
        }

        // --- phrasal verbs recognised by the rules module ------------------
        if (nextToken &&
            (0, rules_1.isPhrasalVerbVp)(token, nextToken)) {
            // is main verb of phrasal verb
            pairs.push([token, Tagset.vb]);
            continue;
        }
        if (rules_1.inflectedPhrasalVerbParticles.includes(token) &&
            rules_1.inflectedPhrasalVerbParticles.includes(prevToken)) {
            pairs.push([token, Tagset.ppv]);
            continue;
        }

        // --- subsidiaries --------------------------------------------------
        if (dictionary_1.subsidiariesA.includes(token) &&
            dictionary_1.baseVerbs.includes(prevToken)) {
            // TODO (from the original author): check the tone pattern; check if last word
            pairs.push([token, Tagset.psub]);
            continue;
        }
        if (dictionary_1.subsidiariesA.includes(token) &&
            dictionary_1.basePhrasalVerbParticles.includes(prevToken) &&
            dictionary_1.baseVerbs.includes(prevToken2)) {
            // TODO (from the original author): check the tone pattern; check if last word
            pairs.push([token, Tagset.psub]);
            continue;
        }

        // --- personal pronouns ---------------------------------------------
        if (rules_1.inflectedPersonalPronouns.includes(token)) {
            pairs.push([token, Tagset.npr]);
            continue;
        }

        // --- particles of phrasal verbs recognised by the rules module -----
        if (prevToken &&
            (0, rules_1.isPhrasalVerbVp)(prevToken, token)) {
            // is a particle of phrasal verb
            pairs.push([token, Tagset.ppv]);
            continue;
        }
        if (prevToken &&
            prevToken2 &&
            (0, rules_1.isPhrasalVerbVpp)(prevToken2, prevToken, token)) {
            // is the 2nd particle of phrasal verb
            pairs.push([token, Tagset.ppv]);
            continue;
        }

        // --- inflected verbs and the particles around them -----------------
        if (rules_1.inflectedVerbs.includes(token) &&
            nextToken &&
            rules_1.inflectedPhrasalVerbParticles.includes(nextToken)) {
            pairs.push([token, Tagset.vb]);
            continue;
        }
        if (rules_1.inflectedPhrasalVerbParticles.includes(token) &&
            nextToken &&
            rules_1.inflectedPhrasalVerbParticles.includes(nextToken)) {
            pairs.push([token, Tagset.ppv]);
            continue;
        }
        if (rules_1.inflectedVerbs.includes(token) &&
            nextToken &&
            rules_1.inflectedPhrasalVerbParticles.includes(nextToken2)) {
            pairs.push([token, Tagset.vb]);
            continue;
        }
        if (rules_1.inflectedPhrasalVerbParticles.includes(token) &&
            prevToken &&
            rules_1.inflectedVerbs.includes(prevToken)) {
            pairs.push([token, Tagset.ppv]);
            continue;
        }

        // --- closed-class words --------------------------------------------
        if (dictionary_1.demonstrativePronouns.includes(token)) {
            pairs.push([token, Tagset.npr]);
            continue;
        }
        if (dictionary_1.auxiliaries.includes(token)) {
            pairs.push([token, Tagset.aux]);
            continue;
        }

        // --- separable VV compounds ----------------------------------------
        // First entry whose first word is this token; single scan of the list.
        const compound = dictionary_1.seperateVVCompounds.find((entry) => entry[0] === token);
        if (compound !== undefined) {
            // the first word of a VV compound: remember which word completes it
            expecting = compound[1];
            pairs.push([token, Tagset.vb]);
            continue;
        }
        if (token === expecting &&
            dictionary_1.seperateVVCompounds.some((entry) => entry[1] === expecting)) {
            // the second word of a VV compound
            expecting = '';
            pairs.push([token, Tagset.vb]);
            continue;
        }

        // No rule matched: fall back to the default tag.
        pairs.push([token, Tagset.nn]);
    }
    return pairs;
}
// Publish the tagger as the module's `tag` export.
exports.tag = tag;
//# sourceMappingURL=tagger.js.map