"use strict";
/**
* @module botbuilder-choices
*/
/**
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License.
*/
Object.defineProperty(exports, "__esModule", { value: true });
/**
* :package: **botbuilder-choices**
*
* Simple tokenizer that breaks on spaces and punctuation. The only normalization done is to
* lowercase the tokens. Developers can wrap this tokenizer with their own function to perform
* additional normalization like [stemming](https://github.com/words/stemmer).
*
* **Usage Example**
*
* ```JavaScript
* const { recognizeChoices, defaultTokenizer } = require('botbuilder-choices');
* const stemmer = require('stemmer');
*
 * function customTokenizer(text, locale) {
 *     const tokens = defaultTokenizer(text, locale);
 *     tokens.forEach((t) => {
 *         t.normalized = stemmer(t.normalized);
 *     });
 *     return tokens;
 * }
*
* const choices = ['red', 'green', 'blue'];
* const utterance = context.activity.text;
* const results = recognizeChoices(utterance, choices, { tokenizer: customTokenizer });
* ```
*/
function defaultTokenizer(text, locale) {
    const tokens = [];
    let token;
    // Close out the in-progress token (if any), recording its end offset and
    // lowercased form, then reset so a new token can be started.
    function appendToken(end) {
        if (token) {
            token.end = end;
            token.normalized = token.text.toLowerCase();
            tokens.push(token);
            token = undefined;
        }
    }
    // Parse text
    const length = text ? text.length : 0;
    let i = 0;
    while (i < length) {
        // Get both the Unicode code point of the current character and the complete
        // character itself, which can span multiple UTF-16 code units (a surrogate pair).
        const codePoint = text.codePointAt(i) || text.charCodeAt(i);
        const chr = String.fromCodePoint(codePoint);
        // Process current character
        if (isBreakingChar(codePoint)) {
            // Character is in Unicode Plane 0 and is in an excluded block
            appendToken(i - 1);
        }
        else if (codePoint > 0xFFFF) {
            // Character is in a Supplementary Unicode Plane. This is where emoji live,
            // so we break each character in this range out as its own token.
            appendToken(i - 1);
            tokens.push({
                start: i,
                end: i + (chr.length - 1),
                text: chr,
                normalized: chr
            });
        }
        else if (!token) {
            // Start a new token
            token = { start: i, text: chr };
        }
        else {
            // Add on to current token
            token.text += chr;
        }
        i += chr.length;
    }
    appendToken(length - 1);
    return tokens;
}
exports.defaultTokenizer = defaultTokenizer;
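// A minimal illustration (not part of the original module) of the token shape
// the function returns, assuming this file is loadable as ./tokenizer in Node.js:
//
//   const { defaultTokenizer } = require('./tokenizer');
//   const tokens = defaultTokenizer('Hello World 😀');
//   // [ { start: 0, end: 4, text: 'Hello', normalized: 'hello' },
//   //   { start: 6, end: 10, text: 'World', normalized: 'world' },
//   //   { start: 12, end: 13, text: '😀', normalized: '😀' } ]
//
// Note that the emoji spans two UTF-16 code units (offsets 12-13) and is emitted
// as its own un-lowercased token, while the breaking spaces are dropped.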
function isBreakingChar(codePoint) {
    return (isBetween(codePoint, 0x0000, 0x002F) || // C0 controls, space, and ! " # $ % & ' ( ) * + , - . /
        isBetween(codePoint, 0x003A, 0x0040) ||     // : ; < = > ? @
        isBetween(codePoint, 0x005B, 0x0060) ||     // [ \ ] ^ _ `
        isBetween(codePoint, 0x007B, 0x00BF) ||     // { | } ~, C1 controls, and Latin-1 punctuation/symbols
        isBetween(codePoint, 0x02B9, 0x036F) ||     // spacing modifier letters and combining diacritical marks
        isBetween(codePoint, 0x2000, 0x2BFF) ||     // general punctuation through miscellaneous symbols and arrows
        isBetween(codePoint, 0x2E00, 0x2E7F));      // supplemental punctuation
}
// Inclusive range test.
function isBetween(value, from, to) {
    return (value >= from && value <= to);
}
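// For illustration (not in the original source): letters and digits fall outside
// every excluded block above, while ASCII punctuation does not, e.g.
//
//   isBreakingChar('a'.codePointAt(0)); // => false (0x61)
//   isBreakingChar('5'.codePointAt(0)); // => false (0x35)
//   isBreakingChar('!'.codePointAt(0)); // => true  (0x21, within 0x0000-0x002F)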
//# sourceMappingURL=tokenizer.js.map