UNPKG

watson-speech

Version:

IBM Watson Speech to Text and Text to Speech SDK for web browsers.

159 lines (141 loc) 5.22 kB
'use strict';

var { Transform } = require('readable-stream');
var util = require('util');
var clone = require('clone');
var defaults = require('defaults');

/**
 * Applies some basic formatting to transcriptions:
 *  - Capitalize the first word of each sentence
 *  - Add a period to the end
 *  - Fix any "cruft" in the transcription
 *  - etc.
 *
 * May be used as either a Stream, or a standalone helper.
 *
 * @param {Object} opts
 * @param {String} [opts.model] - some models / languages need special handling
 * @param {String} [opts.hesitation=''] - what to put down for a "hesitation" event, also consider \u2026 (ellipsis: ...)
 * @param {Boolean} [opts.objectMode=false] - emit `result` objects instead of string Buffers for the `data` events.
 * @constructor
 */
function FormatStream(opts) {
  this.options = defaults(opts, {
    model: '', // some models should have all spaces removed
    hesitation: '',
    decodeStrings: false // false = don't convert strings to buffers before passing to _write
  });
  Transform.call(this, this.options);

  // Japanese and Chinese models get their spaces stripped and use an ideographic full stop
  this.isJaCn = this.options.model.substring(0, 5) === 'ja-JP' || this.options.model.substring(0, 5) === 'zh-CN';
  // pick the transform implementation once, based on whether we receive result objects or text
  this._transform = this.options.objectMode ? this.transformObject : this.transformString;
}
util.inherits(FormatStream, Transform);

var reHesitation = /%HESITATION ?/g; // https://console.bluemix.net/docs/services/speech-to-text/output.html#output - D_ is handled below
var reRepeatedCharacter = /([a-z])\1{2,}/gi; // detect the same character repeated three or more times and remove it
var reDUnderscoreWords = /D_[^\s]+/g; // replace D_(anything)

/**
 * Formats one or more words, removing special symbols, junk, and spacing for some languages
 *
 * The returned string always ends in exactly one space, even when every word was removed
 * (in which case the result is a single space).
 *
 * @param {String} text
 * @return {String}
 */
FormatStream.prototype.clean = function clean(text) {
  // clean out "junk"
  text = text
    .replace(reHesitation, this.options.hesitation ? this.options.hesitation.trim() + ' ' : this.options.hesitation)
    .replace(reRepeatedCharacter, '')
    .replace(reDUnderscoreWords, '');

  // remove spaces for Japanese and Chinese
  if (this.isJaCn) {
    text = text.replace(/ /g, '');
  }

  return text.trim() + ' '; // we want exactly 1 space at the end
};

/**
 * Capitalizes the first word of a sentence
 *
 * @param {String} text
 * @return {String}
 */
FormatStream.prototype.capitalize = function capitalize(text) {
  // capitalize first word, returns '' in the case of an empty word
  return text.charAt(0).toUpperCase() + text.substring(1);
};

/**
 * Puts a period on the end of a sentence
 *
 * @param {String} text
 * @return {String}
 */
FormatStream.prototype.period = function period(text) {
  text = text.trim();
  // don't put a period down if the clean stage removed all of the text
  if (!text) {
    return ' ';
  }
  // just add a space if the sentence ends in an ellipsis
  if (text.slice(-1) === '\u2026') {
    return text + ' ';
  }
  return text + (this.isJaCn ? '。' : '. ');
};

/**
 * Transform implementation used when the stream is not in objectMode:
 * formats each incoming chunk as a final (non-interim) string.
 *
 * @param {Buffer|String} chunk
 * @param {String} encoding
 * @param {Function} next
 */
FormatStream.prototype.transformString = function transformString(chunk, encoding, next) {
  this.push(this.formatString(chunk.toString()));
  next();
};

/**
 * Transform implementation used in objectMode: formats every transcription in the result object.
 *
 * @param {Object} result
 * @param {String} encoding
 * @param {Function} next
 */
FormatStream.prototype.transformObject = function transformObject(result, encoding, next) {
  this.push(this.formatResult(result));
  next();
};

/**
 * Formats a single string result.
 *
 * May be used outside of Node.js streams
 *
 * @param {String} str - text to format
 * @param {bool} [isInterim=false] - set to true to prevent adding a period to the end of the sentence
 * @return {String}
 */
FormatStream.prototype.formatString = function formatString(str, isInterim) {
  str = this.capitalize(this.clean(str));
  return isInterim ? str : this.period(str);
};

/**
 * Creates a new result with all transcriptions formatted
 *
 * The input is cloned first, so the caller's object is not modified.
 *
 * May be used outside of Node.js streams
 *
 * @param {Object} data
 * @return {Object}
 */
FormatStream.prototype.formatResult = function formatResult(data) {
  data = clone(data);
  if (Array.isArray(data.results)) {
    data.results.forEach(function(result, i) {
      // if there are multiple interim results (as produced by the speaker stream),
      // treat the text as final in all but the last result
      var textFinal = result.final || i !== data.results.length - 1;

      result.alternatives = result.alternatives.map(function(alt) {
        alt.transcript = this.formatString(alt.transcript, !textFinal);
        if (alt.timestamps) {
          // timestamps is an array of arrays, each sub-array is in the form ["word", startTime, endTime]
          var words = alt.timestamps
            .map(function(ts) {
              ts[0] = this.clean(ts[0]);
              return ts;
            }, this)
            .filter(function(ts) {
              // remove any timestamps without a word (due to cleaning out junk words).
              // clean() always leaves a trailing space, so the trimmed word must be tested:
              // a bare ' ' is truthy and would otherwise never be removed.
              return ts[0].trim() !== '';
            });

          // Capitalize / punctuate only after the junk words are gone, so that the first and
          // last *remaining* words are the ones affected, matching the formatted transcript.
          if (words.length > 0) {
            words[0][0] = this.capitalize(words[0][0]);
            if (textFinal) {
              var last = words[words.length - 1];
              last[0] = this.period(last[0]);
            }
          }
          alt.timestamps = words;
        }
        return alt;
      }, this);
    }, this);
  }
  return data;
};

// helper loaded from ./to-promise (implementation not visible in this file)
FormatStream.prototype.promise = require('./to-promise');

module.exports = FormatStream;