UNPKG

watson-speech

Version:

IBM Watson Speech to Text and Text to Speech SDK for web browsers.

219 lines (183 loc) 8.77 kB
<!DOCTYPE html>
<!--
  JSDoc-generated page showing the source listing of speech-to-text/format-stream.js.
  The listing inside <pre><code> is display text only; whitespace there is significant,
  so its lines start at column 0 regardless of the surrounding markup indentation.
-->
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Lay the page out at device width on small screens instead of a zoomed-out desktop view. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>JSDoc: Source: speech-to-text/format-stream.js</title>

  <!-- Loaded without defer/async so they have executed before the inline prettyPrint() call at the end of <body>. -->
  <script src="scripts/prettify/prettify.js"></script>
  <script src="scripts/prettify/lang-css.js"></script>

  <!-- Only legacy IE (< 9) evaluates the conditional comment below; other browsers treat it as a plain comment. -->
  <!-- NOTE(review): this points at Google Code hosting; confirm the URL still resolves, or vendor html5shiv next to the other scripts. -->
  <!--[if lt IE 9]>
    <script src="https://html5shiv.googlecode.com/svn/trunk/html5.js"></script>
  <![endif]-->

  <link rel="stylesheet" href="styles/prettify-tomorrow.css">
  <link rel="stylesheet" href="styles/jsdoc-default.css">
</head>
<body>

<!-- Primary content: page title followed by the highlighted source listing. -->
<div id="main">
  <h1 class="page-title">Source: speech-to-text/format-stream.js</h1>

  <section>
    <article>
      <pre class="prettyprint source linenums"><code>'use strict';

var Transform = require('stream').Transform;
var util = require('util');
var clone = require('clone');
var defaults = require('defaults');

/**
 * Applies some basic formatting to transcriptions:
 * - Capitalize the first word of each sentence
 * - Add a period to the end
 * - Fix any "cruft" in the transcription
 * - etc.
 *
 * May be used as either a Stream, or a standalone helper.
 *
 * @param {Object} opts
 * @param {String} [opts.model] - some models / languages need special handling
 * @param {String} [opts.hesitation=''] - what to put down for a "hesitation" event, also consider \u2026 (ellipsis: ...)
 * @param {Boolean} [options.objectMode=false] - emit `result` objects instead of string Buffers for the `data` events.
 * @constructor
 */
function FormatStream(opts) {
  this.options = defaults(opts, {
    model: '', // some models should have all spaces removed
    hesitation: '',
    decodeStrings: false // false = don't convert strings to buffers before passing to _write
  });
  Transform.call(this, this.options);

  this.isJaCn = this.options.model.substring(0, 5) === 'ja-JP' || this.options.model.substring(0, 5) === 'zh-CN';
  this._transform = this.options.objectMode ? this.transformObject : this.transformString;
}
util.inherits(FormatStream, Transform);

var reHesitation = /%HESITATION ?/g; // http://www.ibm.com/watson/developercloud/doc/speech-to-text/output.shtml#hesitation - D_ is handled below
var reRepeatedCharacter = /([a-z])\1{2,}/ig; // detect the same character repeated three or more times and remove it
var reDUnderscoreWords = /D_[^\s]+/g; // replace D_(anything)

/**
 * Formats one or more words, removing special symbols, junk, and spacing for some languages
 * @param {String} text
 * @param {Boolean} isFinal
 * @return {String}
 */
FormatStream.prototype.clean = function clean(text) {
  // clean out "junk"
  text = text
    .replace(reHesitation, this.options.hesitation ? this.options.hesitation.trim() + ' ' : this.options.hesitation)
    .replace(reRepeatedCharacter, '')
    .replace(reDUnderscoreWords, '');

  // remove spaces for Japanese and Chinese
  if (this.isJaCn) {
    text = text.replace(/ /g, '');
  }

  return text.trim() + ' '; // we want exactly 1 space at the end
};

/**
 * Capitalizes the first word of a sentence
 * @param {String} text
 * @return {string}
 */
FormatStream.prototype.capitalize = function capitalize(text) {
  // capitalize first word, returns '' in the case of an empty word
  return text.charAt(0).toUpperCase() + text.substring(1);
};

/**
 * Puts a period on the end of a sentence
 * @param {String} text
 * @return {string}
 */
FormatStream.prototype.period = function period(text) {
  text = text.trim();
  // don't put a period down if the clean stage remove all of the text
  if (!text) {
    return ' ';
  }
  // just add a space if the sentence ends in an ellipse
  if (text.substr(-1) === '\u2026') {
    return text + ' ';
  }
  return text + (this.isJaCn ? '。' : '. ');
};

FormatStream.prototype.transformString = function(chunk, encoding, next) {
  this.push(this.formatString(chunk.toString()));
  next();
};

FormatStream.prototype.transformObject = function formatResult(result, encoding, next) {
  this.push(this.formatResult(result));
  next();
};

/**
 * Formats a single string result.
 *
 * May be used outside of Node.js streams
 *
 * @param {String} str - text to format
 * @param {bool} [isInterim=false] - set to true to prevent adding a period to the end of the sentence
 * @return {String}
 */
FormatStream.prototype.formatString = function(str, isInterim) {
  str = this.capitalize(this.clean(str));
  return isInterim ? str : this.period(str);
};

/**
 * Creates a new result with all transcriptions formatted
 *
 * May be used outside of Node.js streams
 *
 * @param {Object} data
 * @return {Object}
 */
FormatStream.prototype.formatResult = function formatResult(data) {
  data = clone(data);
  if (Array.isArray(data.results)) {
    data.results.forEach(
      function(result, i) {
        // if there are multiple interim results (as produced by the speaker stream),
        // treat the text as final in all but the last result
        var textFinal = result.final || i !== data.results.length - 1;

        result.alternatives = result.alternatives.map(
          function(alt) {
            alt.transcript = this.formatString(alt.transcript, !textFinal);
            if (alt.timestamps) {
              alt.timestamps = alt.timestamps
                .map(
                  function(ts, j, arr) {
                    // timestamps is an array of arrays, each sub-array is in the form ["word", startTime, endTime]'
                    ts[0] = this.clean(ts[0]);
                    if (j === 0) {
                      ts[0] = this.capitalize(ts[0]);
                    }

                    if (j === arr.length - 1 &amp;&amp; textFinal) {
                      ts[0] = this.period(ts[0]);
                    }
                    return ts;
                  },
                  this
                )
                .filter(function(ts) {
                  return ts[0]; // remove any timestamps without a word (due to cleaning out junk words)
                });
            }
            return alt;
          },
          this
        );
      },
      this
    );
  }
  return data;
};

FormatStream.prototype.promise = require('./to-promise');

module.exports = FormatStream;
</code></pre>
    </article>
  </section>
</div>

<!-- Site-wide index of the generated documentation pages. -->
<nav>
  <h2><a href="index.html">Home</a></h2>

  <h3>Modules</h3>
  <ul>
    <li><a href="module-watson-speech.html">watson-speech</a></li>
    <li><a href="module-watson-speech_speech-to-text.html">watson-speech/speech-to-text</a></li>
    <li><a href="module-watson-speech_speech-to-text_get-models.html">watson-speech/speech-to-text/get-models</a></li>
    <li><a href="module-watson-speech_speech-to-text_recognize-file.html">watson-speech/speech-to-text/recognize-file</a></li>
    <li><a href="module-watson-speech_speech-to-text_recognize-microphone.html">watson-speech/speech-to-text/recognize-microphone</a></li>
    <li><a href="module-watson-speech_text-to-speech.html">watson-speech/text-to-speech</a></li>
    <li><a href="module-watson-speech_text-to-speech_get-voices.html">watson-speech/text-to-speech/get-voices</a></li>
    <li><a href="module-watson-speech_text-to-speech_synthesize.html">watson-speech/text-to-speech/synthesize</a></li>
  </ul>

  <h3>Classes</h3>
  <ul>
    <li><a href="FilePlayer.html">FilePlayer</a></li>
    <li><a href="FormatStream.html">FormatStream</a></li>
    <li><a href="RecognizeStream.html">RecognizeStream</a></li>
    <li><a href="ResultStream.html">ResultStream</a></li>
    <li><a href="SpeakerStream.html">SpeakerStream</a></li>
    <li><a href="TimingStream.html">TimingStream</a></li>
    <li><a href="UrlPlayer.html">UrlPlayer</a></li>
    <li><a href="WebAudioL16Stream.html">WebAudioL16Stream</a></li>
    <li><a href="WritableElementStream.html">WritableElementStream</a></li>
  </ul>

  <h3>Events</h3>
  <ul>
    <li><a href="RecognizeStream.html#event:close">close</a></li>
    <li><a href="RecognizeStream.html#event:data">data</a></li>
    <li><a href="RecognizeStream.html#event:error">error</a></li>
    <li><a href="RecognizeStream.html#event:listening">listening</a></li>
    <li><a href="RecognizeStream.html#event:message">message</a></li>
    <li><a href="RecognizeStream.html#event:open">open</a></li>
    <li><a href="RecognizeStream.html#event:send-data">send-data</a></li>
    <li><a href="RecognizeStream.html#event:send-json">send-json</a></li>
    <li><a href="RecognizeStream.html#event:stop">stop</a></li>
    <li><a href="SpeakerStream.html#event:data">data</a></li>
  </ul>

  <h3>Global</h3>
  <ul>
    <li><a href="global.html#getContentTypeFromFile">getContentTypeFromFile</a></li>
    <li><a href="global.html#playFile">playFile</a></li>
  </ul>
</nav>

<br class="clear">

<footer>
  Documentation generated by <a href="https://github.com/jsdoc3/jsdoc">JSDoc 3.4.3</a> on Tue Feb 21 2017 17:41:51 GMT+0000 (UTC)
</footer>

<!-- Highlight every <pre class="prettyprint"> block, then let linenumber.js post-process the numbered listing. -->
<script> prettyPrint(); </script>
<script src="scripts/linenumber.js"></script>
</body>
</html>