UNPKG

botbuilder-dialogs

Version:

A dialog stack based conversation manager for Microsoft BotBuilder.

github.com/Microsoft/botbuilder-js

Microsoft/botbuilder-js

109 lines • 3.51 kB

JavaScript

"use strict"; /** * @module botbuilder-dialogs */ /** * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the MIT License. */ Object.defineProperty(exports, "__esModule", { value: true }); exports.defaultTokenizer = void 0; /** * Simple tokenizer that breaks on spaces and punctuation. * * @param text The input text. * @param _locale Optional, identifies the locale of the input text. * @returns A list of tokens. * @remarks * The only normalization done is to lowercase the tokens. Developers can wrap this tokenizer with * their own function to perform additional normalization like [stemming](https://github.com/words/stemmer). * * ```JavaScript * const { recognizeChoices, defaultTokenizer } = require('botbuilder-choices'); * const stemmer = require('stemmer'); * * function customTokenizer(text, locale) { * const tokens = defaultTokenizer(text, locale); * tokens.forEach((t) => { * t.normalized = stemmer(t.normalized); * }); * return tokens; * } * * const choices = ['red', 'green', 'blue']; * const utterance = context.activity.text; * const results = recognizeChoices(utterance, choices, { tokenizer: customTokenizer }); * ``` */ function defaultTokenizer(text, _locale) { const tokens = []; let token; function appendToken(end) { if (token) { token.end = end; token.normalized = token.text.toLowerCase(); tokens.push(token); token = undefined; } } // Parse text const length = text ? text.length : 0; let i = 0; while (i < length) { // Get both the UNICODE value of the current character and the complete character itself // which can potentially be multiple segments. const codePoint = text.codePointAt(i) || text.charCodeAt(i); const chr = String.fromCodePoint(codePoint); // Process current character if (isBreakingChar(codePoint)) { // Character is in Unicode Plane 0 and is in an excluded block appendToken(i - 1); } else if (codePoint > 0xffff) { // Character is in a Supplementary Unicode Plane. This is where emoji live so // we're going to just break each character in this range out as its own token. appendToken(i - 1); tokens.push({ start: i, end: i + (chr.length - 1), text: chr, normalized: chr, }); } else if (!token) { // Start a new token token = { start: i, text: chr }; } else { // Add on to current token token.text += chr; } i += chr.length; } appendToken(length - 1); return tokens; } exports.defaultTokenizer = defaultTokenizer; /** * @private * @param codePoint number of character */ function isBreakingChar(codePoint) { return (isBetween(codePoint, 0x0000, 0x002f) || isBetween(codePoint, 0x003a, 0x0040) || isBetween(codePoint, 0x005b, 0x0060) || isBetween(codePoint, 0x007b, 0x00bf) || isBetween(codePoint, 0x02b9, 0x036f) || isBetween(codePoint, 0x2000, 0x2bff) || isBetween(codePoint, 0x2e00, 0x2e7f)); } /** * @private * @param value number value * @param from low range * @param to high range */ function isBetween(value, from, to) { return value >= from && value <= to; } //# sourceMappingURL=tokenizer.js.map