/*
 * botbuilder-dialogs
 * Version:
 * A dialog stack based conversation manager for Microsoft BotBuilder.
 * 150 lines (138 loc) • 4.52 kB
 * text/typescript
 */
/**
* @module botbuilder-dialogs
*/
/**
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
*/
/**
* A single token produced by a `TokenizerFunction`.
*
* @remarks
* `start` and `end` are indexes into the string that was tokenized. `defaultTokenizer()` reports
* both as inclusive indexes, so `end` is the index of the token's last UTF-16 code unit rather
* than one past it.
*/
export interface Token {
/**
* Index within the outer string at which the token begins.
*/
start: number;
/**
* Index within the outer string at which the token ends. `defaultTokenizer()` sets this to the
* index of the token's last code unit (inclusive).
*/
end: number;
/**
* The token's text exactly as it appeared in the outer string.
*/
text: string;
/**
* Normalized form of the token. This can include things like lower casing or stemming.
* `defaultTokenizer()` lower-cases word tokens and leaves supplementary-plane characters
* (such as emoji) unchanged.
*/
normalized: string;
}
/**
* Signature of a word breaker. A function of this shape can be passed to `recognizeChoices()`,
* `findChoices()`, or `findValues()` as an alternative to `defaultTokenizer()`.
*
* ```TypeScript
* type TokenizerFunction = (text: string, locale?: string) => Token[];
* ```
*
* @remarks
* The `defaultTokenizer()` is fairly simple and only breaks on spaces and punctuation.
* @param TokenizerFunction.text The text to be tokenized.
* @param TokenizerFunction.locale (Optional) locale of the text if known.
* @returns The tokens found in `text`, as an array of `Token` objects.
*/
export type TokenizerFunction = (text: string, locale?: string) => Token[];
/**
 * Splits text into word tokens, treating whitespace and punctuation as separators.
 *
 * @param text The text to tokenize. A falsy value (for example an empty string) yields no tokens.
 * @param _locale Optional locale of the text. Accepted so the function matches
 * `TokenizerFunction`; this implementation does not read it.
 * @returns The tokens, in the order they occur in `text`.
 * @remarks
 * Separator characters never appear in the output. Every character outside the Basic
 * Multilingual Plane (where emoji live) is emitted as a token of its own, with `normalized`
 * equal to `text`. For all other tokens the only normalization is lower casing. `start` and
 * `end` are inclusive UTF-16 code unit indexes into `text`.
 *
 * Wrap this tokenizer to add further normalization such as
 * [stemming](https://github.com/words/stemmer):
 *
 * ```JavaScript
 * const { recognizeChoices, defaultTokenizer } = require('botbuilder-dialogs');
 * const stemmer = require('stemmer');
 *
 * function customTokenizer(text, locale) {
 *     const tokens = defaultTokenizer(text, locale);
 *     tokens.forEach((t) => {
 *         t.normalized = stemmer(t.normalized);
 *     });
 *     return tokens;
 * }
 *
 * const choices = ['red', 'green', 'blue'];
 * const utterance = context.activity.text;
 * const results = recognizeChoices(utterance, choices, { tokenizer: customTokenizer });
 * ```
 */
export function defaultTokenizer(text: string, _locale?: string): Token[] {
    const result: Token[] = [];
    // Index at which the word currently being scanned began, or -1 when no word is open.
    let wordStart = -1;

    // Emits the open word, if there is one, whose last code unit sits at `lastIndex`.
    const flushWord = (lastIndex: number): void => {
        if (wordStart < 0) {
            return;
        }
        const word = text.slice(wordStart, lastIndex + 1);
        result.push({ start: wordStart, text: word, end: lastIndex, normalized: word.toLowerCase() });
        wordStart = -1;
    };

    const total = text ? text.length : 0;
    for (let pos = 0; pos < total; ) {
        // Read the full code point at `pos`; a surrogate pair spans two code units, so
        // `symbol.length` tells us how far to advance.
        const cp = text.codePointAt(pos) || text.charCodeAt(pos);
        const symbol = String.fromCodePoint(cp);
        const width = symbol.length;

        if (isBreakingChar(cp)) {
            // Separator: close the open word and drop the separator itself.
            flushWord(pos - 1);
        } else if (cp > 0xffff) {
            // Supplementary-plane character: close the open word, then emit the character
            // as a standalone token.
            flushWord(pos - 1);
            result.push({ start: pos, end: pos + width - 1, text: symbol, normalized: symbol });
        } else if (wordStart < 0) {
            // First character of a new word; later characters simply extend it.
            wordStart = pos;
        }
        pos += width;
    }

    // Close a word that runs to the end of the input.
    flushWord(total - 1);
    return result;
}
/**
 * Inclusive `[low, high]` code point ranges that `isBreakingChar()` treats as separators.
 */
const BREAKING_RANGES: ReadonlyArray<readonly [number, number]> = [
    [0x0000, 0x002f], // control characters, space, and ASCII punctuation up to `/`
    [0x003a, 0x0040], // `:` through `@`
    [0x005b, 0x0060], // `[` through the backtick
    [0x007b, 0x00bf], // `{` through `¿`
    [0x02b9, 0x036f], // modifier letters (from U+02B9) and combining diacritical marks
    [0x2000, 0x2bff], // General Punctuation through Miscellaneous Symbols and Arrows
    [0x2e00, 0x2e7f], // Supplemental Punctuation
];

/**
 * Tells whether a code point is a separator, i.e. falls inside one of the breaking ranges.
 *
 * @private
 * @param codePoint Code point of the character to classify.
 * @returns `true` when the code point lies within any breaking range, otherwise `false`.
 */
function isBreakingChar(codePoint: number): boolean {
    for (const [low, high] of BREAKING_RANGES) {
        if (isBetween(codePoint, low, high)) {
            return true;
        }
    }
    return false;
}
/**
 * Tests whether a number lies inside a closed interval.
 *
 * @private
 * @param value Number to test.
 * @param from Lower bound (inclusive).
 * @param to Upper bound (inclusive).
 * @returns `true` when `from <= value <= to`, otherwise `false`.
 */
function isBetween(value: number, from: number, to: number): boolean {
    const atOrAboveLow = from <= value;
    const atOrBelowHigh = value <= to;
    return atOrAboveLow && atOrBelowHigh;
}