/*
 * @bntk/pos
 * Build applications with Bengali natural language processing tools.
 * (Package-viewer metadata: 176 lines, 175 loc, 6.64 kB, TypeScript.)
 */
/**
* Part-of-Speech (POS) tagging and analysis functions for Bangla text
*/
/**
* Universal POS tags based on the Universal Dependencies (UD) v2 specification.
* Source: https://universaldependencies.org/u/pos/
*
* Each member's string value is identical to its name (e.g. `ADJ = "ADJ"`),
* so a tag's runtime value is the UD tag string itself.
*
* NOTE: this is an ambient `const enum`. TypeScript normally inlines member
* values at use sites, and projects compiled with `isolatedModules` cannot
* reference ambient const enum members. Whether a runtime enum object is
* also emitted cannot be determined from this declaration alone.
*/
export declare const enum UniversalPOSTag {
/** Adjectives are words that typically modify nouns
* @example বড়/boro, সুন্দর/sundor, নতুন/notun */
ADJ = "ADJ",
/** Adpositions are prepositions and postpositions
* @example মধ্যে/moddhe, থেকে/theke, উপরে/upore */
ADP = "ADP",
/** Adverbs are words that typically modify verbs, adjectives or other adverbs
* @example খুব/khub, ধীরে/dhire, ভালোভাবে/bhalobhabe */
ADV = "ADV",
/** Auxiliary verbs are used to form tenses, moods, etc.
* @example আছে/ache, হয়/hoi, হবে/hobe */
AUX = "AUX",
/** Coordinating conjunctions connect words, phrases, clauses of equal status
* @example এবং/ebong, কিন্তু/kintu, অথবা/othoba */
CCONJ = "CCONJ",
/** Determiners are words that modify nouns or noun phrases
* @example এই/ei, সেই/sei, কোন/kon */
DET = "DET",
/** Interjections are exclamatory words
* @example ওহ/oh, বাহ/bah, হায়/hay */
INTJ = "INTJ",
/** Nouns are words that typically denote a person, place, thing, animal or idea
* @example বই/boi, মানুষ/manush, বাড়ি/bari */
NOUN = "NOUN",
/** Numerals represent numbers and quantities. Under UD only cardinal
* numerals are NUM; ordinals (e.g. প্রথম/prothom, "first") are ADJ.
* @example এক/ek, দুই/dui, তিন/tin */
NUM = "NUM",
/** Particles are function words that must be associated with another word
* @example না/na, তো/to, কি/ki */
PART = "PART",
/** Pronouns substitute for nouns or noun phrases
* @example আমি/ami, তুমি/tumi, সে/se */
PRON = "PRON",
/** Proper nouns are names of specific persons, places, organizations
* @example ঢাকা/dhaka, রবীন্দ্রনাথ/robindronath, বাংলাদেশ/bangladesh */
PROPN = "PROPN",
/** Punctuation marks
* @example `।`  `?`  `!`  `,` */
PUNCT = "PUNCT",
/** Subordinating conjunctions link dependent clauses to independent ones
* @example যদি/jodi, কারণ/karon, যখন/jokhon */
SCONJ = "SCONJ",
/** Symbols represent currency, math operators, etc.
* @example `৳`  `+`  `=` */
SYM = "SYM",
/** Verbs denote actions and processes
* @example যাই/jai, খাই/khai, পড়ি/pori */
VERB = "VERB",
/** Other words that don't fit into above categories
* @example ইত্যাদি/ittyadi, প্রভৃতি/probhriti */
X = "X"
}
/**
* A single word paired with its POS tag and, optionally, additional
* linguistic features. This is the element type returned by the tagging
* functions and accepted by `getPOSTagFrequencies`.
*/
export interface TaggedWord {
/** The original word */
word: string;
/** The POS tag for the word (a `UniversalPOSTag` member; its runtime value is the tag string, e.g. "NOUN") */
tag: UniversalPOSTag;
/**
* Additional linguistic features, keyed by feature name with string values
* (e.g., gender, number, case). Optional: the property may be absent.
*/
features?: Record<string, string>;
}
/**
* Tags a single Bangla word with its part of speech
*
* @param {string} word - The Bangla word to tag
* @returns {TaggedWord} The word with its POS tag and (optionally) features
*
* @description
* This function performs POS tagging for a single Bangla word.
* It uses a combination of rules and dictionary lookup to determine
* the most likely POS tag for the given word.
* NOTE(review): the tagging strategy is stated by the original docs; it
* cannot be confirmed from this declaration alone.
*
* The outputs below show the tag as its string value, because every
* `UniversalPOSTag` member's value equals its name.
*
* @example
* ```typescript
* const result = tagWord("বাংলা");
* console.log(result);
* // Output: { word: "বাংলা", tag: "NOUN" }   // i.e. UniversalPOSTag.NOUN
* ```
*
* @example
* ```typescript
* const result = tagWord("সুন্দর");
* console.log(result);
* // Output: { word: "সুন্দর", tag: "ADJ" }   // i.e. UniversalPOSTag.ADJ
* ```
*/
export declare function tagWord(word: string): TaggedWord;
/**
* Tags a sequence of Bangla words with their parts of speech
*
* @param {string[]} words - Array of Bangla words to tag
* @returns {TaggedWord[]} Array of words with their POS tags and features
*
* @description
* This function performs POS tagging for a sequence of Bangla words.
* It takes into account the context of surrounding words to improve
* tagging accuracy.
*
* The output below shows tags as their string values, because every
* `UniversalPOSTag` member's value equals its name.
*
* NOTE(review): "বাংলায়" is a locative-inflected noun ("in Bangla"), which
* UD would normally tag NOUN/PROPN rather than ADP. Confirm that the ADP
* tag in the example reflects the tagger's actual output.
*
* @example
* ```typescript
* const words = ["আমি", "বাংলায়", "গান", "গাই"];
* const tagged = tagWords(words);
* console.log(tagged);
* // Output: [
* //   { word: "আমি", tag: "PRON" },
* //   { word: "বাংলায়", tag: "ADP" },
* //   { word: "গান", tag: "NOUN" },
* //   { word: "গাই", tag: "VERB" }
* // ]
* ```
*/
export declare function tagWords(words: string[]): TaggedWord[];
/**
* Tags a Bangla text string with parts of speech
*
* @param {string} text - The Bangla text to tag
* @returns {TaggedWord[]} Array of words with their POS tags and features
*
* @description
* This function first tokenizes the input text into words and then
* performs POS tagging on the resulting word sequence.
* NOTE(review): the tokenization rules (e.g. how punctuation is split)
* are not visible from this declaration.
*
* The output below shows tags as their string values, because every
* `UniversalPOSTag` member's value equals its name.
*
* NOTE(review): "বাংলায়" is a locative-inflected noun ("in Bangla"), which
* UD would normally tag NOUN/PROPN rather than ADP. Confirm that the ADP
* tag in the example reflects the tagger's actual output.
*
* @example
* ```typescript
* const text = "আমি বাংলায় গান গাই";
* const tagged = tagText(text);
* console.log(tagged);
* // Output: [
* //   { word: "আমি", tag: "PRON" },
* //   { word: "বাংলায়", tag: "ADP" },
* //   { word: "গান", tag: "NOUN" },
* //   { word: "গাই", tag: "VERB" }
* // ]
* ```
*/
export declare function tagText(text: string): TaggedWord[];
/**
* Counts how many times each POS tag occurs in a sequence of tagged words
*
* @param {TaggedWord[]} taggedWords - Array of tagged words
* @returns {Map<UniversalPOSTag, number>} Map of POS tags to their frequencies
*
* @description
* This function analyzes a sequence of tagged words and returns
* a map showing how many times each POS tag appears. It returns the
* counts themselves, not a ranked list of the most common tags.
* NOTE(review): the example suggests only tags present in the input get
* an entry. Zero-count tags and iteration order are not specified by this
* declaration; confirm against the implementation.
*
* The Map keys below are shown as their string values, because every
* `UniversalPOSTag` member's value equals its name.
*
* @example
* ```typescript
* const tagged = [
*   { word: "আমি", tag: UniversalPOSTag.PRON },
*   { word: "বাংলায়", tag: UniversalPOSTag.ADP },
*   { word: "গান", tag: UniversalPOSTag.NOUN },
*   { word: "গাই", tag: UniversalPOSTag.VERB }
* ];
* const frequencies = getPOSTagFrequencies(tagged);
* console.log(frequencies);
* // Output: Map(4) {
* //   "PRON" => 1,
* //   "ADP" => 1,
* //   "NOUN" => 1,
* //   "VERB" => 1
* // }
* ```
*/
export declare function getPOSTagFrequencies(taggedWords: TaggedWord[]): Map<UniversalPOSTag, number>;