UNPKG

watson-speech

Version:

IBM Watson Speech to Text and Text to Speech SDK for web browsers.

179 lines (144 loc) 5.68 kB
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>JSDoc: Source: format-stream.js</title>

    <script src="scripts/prettify/prettify.js"> </script>
    <script src="scripts/prettify/lang-css.js"> </script>
    <!--[if lt IE 9]>
      <script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
    <![endif]-->
    <link type="text/css" rel="stylesheet" href="styles/prettify-tomorrow.css">
    <link type="text/css" rel="stylesheet" href="styles/jsdoc-default.css">
</head>

<body>

<div id="main">

    <h1 class="page-title">Source: format-stream.js</h1>

    <section>
        <article>
            <pre class="prettyprint source linenums"><code>'use strict';

var Transform = require('stream').Transform;
var util = require('util');
var clone = require('clone');

/**
 * Applies some basic formatting to transcriptions:
 *  - Capitalize the first word of each sentence
 *  - Add a period to the end
 *  - Fix any "cruft" in the transcription
 *  - etc.
 *
 * When a source stream is piped into this one, the source's `result` events are
 * formatted and re-emitted from this stream, and the source's `stop` method (if it
 * has one) is exposed as `this.stop`.
 *
 * @param {Object} opts
 * @param {String} [opts.model=''] - some models / languages need special handling
 * @param {String} [opts.hesitation='\u2026'] - what to put down for a "hesitation" event, defaults to an ellipsis (...)
 * @constructor
 */
function FormatStream(opts) {
  this.opts = util._extend({
    model: '', // some models should have all spaces removed
    hesitation: '\u2026', // ellipsis
    decodeStrings: true
  }, opts);
  Transform.call(this, opts);

  // an explicit `model: undefined` / `model: null` overrides the default above, so coerce before slicing
  var language = String(this.opts.model || '').substring(0, 5);
  this.isJaCn = (language === 'ja-JP' || language === 'zh-CN');

  var self = this;
  this.on('pipe', function(source) {
    source.on('result', self.handleResult.bind(self));
    if (source.stop) {
      self.stop = source.stop.bind(source);
    }
  });
}
util.inherits(FormatStream, Transform);

// when the service detects a "hesitation" pause, it literally puts the string "%HESITATION" into the transcription.
// The marker is matched when followed by whitespace OR when it is the last thing in the text: clean() trims its
// input first, and timestamp words are cleaned one at a time, so a trailing marker has no whitespace after it.
var reHesitation = /%HESITATION(?:\s|$)/g;
var reRepeatedCharacter = /(.)\1{2,}/g; // detect the same character repeated three or more times and remove it
var reDUnderscoreWords = /D_[^\s]+/g; // replace D_(anything)

/**
 * Formats one or more words, removing special symbols, junk, and spacing for some languages
 * @param {String} text
 * @returns {String} the cleaned text; may be an empty string if everything was junk
 */
FormatStream.prototype.clean = function clean(text) {
  // clean out "junk"
  text = text.trim().replace(reHesitation, this.opts.hesitation)
    .replace(reRepeatedCharacter, '')
    .replace(reDUnderscoreWords, '');

  // short-circuit if there's no actual text (avoids getting multiple periods after a pause)
  if (!text) {
    return text;
  }

  // remove spaces for Japanese and Chinese
  if (this.isJaCn) {
    text = text.replace(/ /g, '');
  }

  return text;
};

/**
 * Capitalizes the first word of a sentence
 * @param {String} text
 * @returns {String}
 */
FormatStream.prototype.capitalize = function capitalize(text) {
  // capitalize first word, returns '' in the case of an empty word
  return text.charAt(0).toUpperCase() + text.substring(1);
};

/**
 * Puts a period on the end of a sentence.
 * Empty text is returned unchanged so that a chunk which was cleaned down to nothing
 * does not turn into a lone period.
 * @param {String} text
 * @returns {String}
 */
FormatStream.prototype.period = function period(text) {
  if (!text) {
    return text;
  }
  return text + (this.isJaCn ? '。' : '. ');
};

/**
 * Transform implementation: cleans, capitalizes and terminates each incoming chunk of text.
 * @param {Buffer|String} chunk
 * @param {String} encoding
 * @param {Function} next
 */
FormatStream.prototype._transform = function(chunk, encoding, next) {
  var formatted = this.period(this.capitalize(this.clean(chunk.toString())));
  // nothing to write when the whole chunk was junk
  if (formatted) {
    this.push(formatted);
  }
  next();
};

/**
 * Creates a new result with all transcriptions formatted and emits it as a `result` event.
 * The incoming result is cloned first, so the object passed in is not modified.
 *
 * @param {Object} result - must have an `alternatives` array; a truthy `final` adds sentence-ending periods
 */
FormatStream.prototype.handleResult = function handleResult(result) {
  result = clone(result);
  result.alternatives = result.alternatives.map(function(alt) {
    alt.transcript = this.capitalize(this.clean(alt.transcript));
    if (result.final) {
      alt.transcript = this.period(alt.transcript);
    }
    if (alt.timestamps) {
      alt.timestamps = alt.timestamps.map(function(ts, i, arr) {
        // timestamps is an array of arrays, each sub-array is in the form ["word", startTime, endTime]
        ts[0] = this.clean(ts[0]);
        if (i === 0) {
          ts[0] = this.capitalize(ts[0]);
        }
        if (i === arr.length - 1 &amp;&amp; result.final) {
          ts[0] = this.period(ts[0]);
        }
        return ts;
      }, this);
      // todo: remove any timestamps without a word (due to cleaning out junk words)
    }
    return alt;
  }, this);
  this.emit('result', result);
};

FormatStream.prototype.promise = require('./promise');

FormatStream.prototype.stop = function(){}; // usually overwritten during the `pipe` event

module.exports = FormatStream;
</code></pre>
        </article>
    </section>

</div>

<nav>
    <h2><a href="index.html">Home</a></h2><h3>Classes</h3><ul><li><a href="FormatStream.html">FormatStream</a></li><li><a href="MediaElementAudioStream.html">MediaElementAudioStream</a></li><li><a href="RecognizeStream.html">RecognizeStream</a></li><li><a href="TimingStream.html">TimingStream</a></li></ul><h3>Events</h3><ul><li><a href="MicrophoneStream.html#event:data">data</a></li><li><a href="MicrophoneStream.html#event:raw">raw</a></li><li><a href="RecognizeStream.html#event:connection-close">connection-close</a></li><li><a href="RecognizeStream.html#event:data">data</a></li><li><a href="RecognizeStream.html#event:error">error</a></li><li><a href="RecognizeStream.html#event:results">results</a></li></ul><h3>Global</h3><ul><li><a href="global.html#MAX_WAV">MAX_WAV</a></li></ul>
</nav>

<br class="clear">

<footer>
    Documentation generated by <a href="https://github.com/jsdoc3/jsdoc">JSDoc 3.4.0</a> on Mon Feb 08 2016 16:11:17 GMT+0000 (UTC)
</footer>

<script> prettyPrint(); </script>
<script src="scripts/linenumber.js"> </script>
</body>
</html>