watson-speech
Version:
IBM Watson Speech to Text and Text to Speech SDK for web browsers.
219 lines (183 loc) • 8.77 kB
HTML
<html lang="en">
<head>
<meta charset="utf-8">
<title>JSDoc: Source: speech-to-text/format-stream.js</title>
<script src="scripts/prettify/prettify.js"> </script>
<script src="scripts/prettify/lang-css.js"> </script>
<!--[if lt IE 9]>
<script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
<![endif]-->
<link type="text/css" rel="stylesheet" href="styles/prettify-tomorrow.css">
<link type="text/css" rel="stylesheet" href="styles/jsdoc-default.css">
</head>
<body>
<div id="main">
<h1 class="page-title">Source: speech-to-text/format-stream.js</h1>
<section>
<article>
<pre class="prettyprint source linenums"><code>'use strict';
var Transform = require('stream').Transform;
var util = require('util');
var clone = require('clone');
var defaults = require('defaults');
/**
 * Transform stream (also usable as a standalone helper) that tidies up transcriptions:
 *  - capitalizes the first word of each sentence
 *  - appends a period to the end
 *  - strips "cruft" such as hesitation markers out of the text
 *
 * @param {Object} opts
 * @param {String} [opts.model=''] - model name; some models / languages need special handling
 * @param {String} [opts.hesitation=''] - what to put down for a "hesitation" event, also consider \u2026 (ellipsis: ...)
 * @param {Boolean} [opts.objectMode=false] - emit `result` objects instead of string Buffers for the `data` events.
 * @constructor
 */
function FormatStream(opts) {
  var settings = defaults(opts, {
    model: '', // some models should have all spaces removed
    hesitation: '',
    decodeStrings: false // false = don't convert strings to buffers before passing to _write
  });
  this.options = settings;
  Transform.call(this, this.options);

  // Japanese and Chinese models get their spaces removed and use a different full stop
  var modelPrefix = this.options.model.substring(0, 5);
  this.isJaCn = modelPrefix === 'ja-JP' || modelPrefix === 'zh-CN';

  // pick the handler that matches the stream's mode
  if (this.options.objectMode) {
    this._transform = this.transformObject;
  } else {
    this._transform = this.transformString;
  }
}
// FormatStream inherits from the Transform constructor of the `stream` module
util.inherits(FormatStream, Transform);
// Patterns used by FormatStream.prototype.clean to strip "junk" out of transcriptions.
// Matches the %HESITATION marker together with one optional trailing space.
var reHesitation = /%HESITATION ?/g; // http://www.ibm.com/watson/developercloud/doc/speech-to-text/output.shtml#hesitation - D_ is handled below
// Matches a letter (case-insensitive) followed by two or more repeats of itself.
var reRepeatedCharacter = /([a-z])\1{2,}/ig; // detect the same character repeated three or more times and remove it
// Matches "D_" followed by one or more non-whitespace characters.
var reDUnderscoreWords = /D_[^\s]+/g; // replace D_(anything)
/**
 * Cleans up one or more words: swaps out hesitation markers, drops junk
 * (letters repeated three or more times, D_* tokens) and removes spaces for Japanese and Chinese models
 * @param {String} text
 * @return {String} the cleaned text, trimmed and followed by exactly one space
 */
FormatStream.prototype.clean = function clean(text) {
  var hesitation = this.options.hesitation;
  // a non-empty replacement is trimmed and followed by a single space; an empty one is used as-is
  var hesitationText = hesitation ? hesitation.trim() + ' ' : hesitation;

  var cleaned = text.replace(reHesitation, hesitationText);
  cleaned = cleaned.replace(reRepeatedCharacter, '');
  cleaned = cleaned.replace(reDUnderscoreWords, '');

  if (this.isJaCn) {
    // Japanese and Chinese text should not contain spaces
    cleaned = cleaned.replace(/ /g, '');
  }

  // we want exactly 1 space at the end
  return cleaned.trim() + ' ';
};
/**
 * Upper-cases the first character of the given text and leaves the rest untouched
 * @param {String} text
 * @return {string} the capitalized text ('' when text is empty)
 */
FormatStream.prototype.capitalize = function capitalize(text) {
  var firstCharacter = text.charAt(0); // '' for an empty string
  var remainder = text.slice(1);
  return firstCharacter.toUpperCase() + remainder;
};
/**
 * Terminates a sentence: trims it, then appends a period
 * (or only a space when the sentence already ends in an ellipsis)
 * @param {String} text
 * @return {string}
 */
FormatStream.prototype.period = function period(text) {
  var sentence = text.trim();

  // don't put a period down if the clean stage removed all of the text
  if (sentence.length === 0) {
    return ' ';
  }

  // just add a space if the sentence ends in an ellipsis
  var lastCharacter = sentence.charAt(sentence.length - 1);
  if (lastCharacter === '\u2026') {
    return sentence + ' ';
  }

  var terminator = this.isJaCn ? '。' : '. ';
  return sentence + terminator;
};
/**
 * Stream handler installed as `_transform` when the stream is not in object mode:
 * converts the chunk to a string, formats it and pushes the result downstream
 * @param {Buffer|String} chunk
 * @param {String} encoding
 * @param {Function} next
 */
FormatStream.prototype.transformString = function(chunk, encoding, next) {
  var formatted = this.formatString(chunk.toString());
  this.push(formatted);
  next();
};
/**
 * Stream handler installed as `_transform` when the stream is in object mode:
 * formats a result object and pushes the formatted copy downstream
 * @param {Object} result
 * @param {String} encoding
 * @param {Function} next
 */
// The function expression is named after the property it is assigned to, so stack traces
// and profiles point at this handler rather than at formatResult.
FormatStream.prototype.transformObject = function transformObject(result, encoding, next) {
  this.push(this.formatResult(result));
  next();
};
/**
 * Formats a single string result: cleans it, capitalizes it and,
 * unless it is an interim result, finishes it off with a period.
 *
 * May be used outside of Node.js streams
 *
 * @param {String} str - text to format
 * @param {bool} [isInterim=false] - set to true to prevent adding a period to the end of the sentence
 * @return {String}
 */
FormatStream.prototype.formatString = function(str, isInterim) {
  var text = this.capitalize(this.clean(str));
  if (isInterim) {
    return text;
  }
  return this.period(text);
};
/**
 * Creates a new result with all transcriptions formatted
 *
 * May be used outside of Node.js streams
 *
 * Works on a copy produced by `clone()`; the formatted copy is what gets returned.
 * For every alternative of every result the transcript is run through formatString(),
 * and, when present, the word timestamps are cleaned so that they line up with the transcript:
 * words that are empty after cleaning are dropped, the first remaining word is capitalized
 * and, for text treated as final, the last remaining word receives the period.
 *
 * Data without a `results` array is returned (as a copy) with no further changes, and
 * results without an `alternatives` array are skipped.
 *
 * @param {Object} data
 * @return {Object}
 */
FormatStream.prototype.formatResult = function formatResult(data) {
  data = clone(data);
  if (!Array.isArray(data.results)) {
    return data;
  }
  var lastResultIndex = data.results.length - 1;
  data.results.forEach(function(result, i) {
    if (!Array.isArray(result.alternatives)) {
      return; // nothing to format in this result
    }
    // if there are multiple interim results (as produced by the speaker stream),
    // treat the text as final in all but the last result
    var textFinal = result.final || i !== lastResultIndex;
    result.alternatives.forEach(function(alt) {
      alt.transcript = this.formatString(alt.transcript, !textFinal);
      if (!Array.isArray(alt.timestamps)) {
        return;
      }
      // timestamps is an array of arrays, each sub-array is in the form ["word", startTime, endTime]
      var words = alt.timestamps
        .map(function(ts) {
          ts[0] = this.clean(ts[0]);
          return ts;
        }, this)
        .filter(function(ts) {
          // clean() always appends a space, so a junk-only word comes back as ' ';
          // test the trimmed text to actually remove timestamps without a word
          return ts[0].trim().length > 0;
        });
      // capitalize / punctuate after filtering so that a leading or trailing junk word
      // does not swallow the capital letter or the period
      if (words.length > 0) {
        words[0][0] = this.capitalize(words[0][0]);
        if (textFinal) {
          var lastWord = words[words.length - 1];
          lastWord[0] = this.period(lastWord[0]);
        }
      }
      alt.timestamps = words;
    }, this);
  }, this);
  return data;
};
// Expose whatever ./to-promise exports as the `promise` method on every FormatStream instance
// NOTE(review): presumably collects the stream's output into a Promise - confirm against ./to-promise
FormatStream.prototype.promise = require('./to-promise');
// The FormatStream constructor is this module's export
module.exports = FormatStream;
</code></pre>
</article>
</section>
</div>
<nav>
<h2><a href="index.html">Home</a></h2><h3>Modules</h3><ul><li><a href="module-watson-speech.html">watson-speech</a></li><li><a href="module-watson-speech_speech-to-text.html">watson-speech/speech-to-text</a></li><li><a href="module-watson-speech_speech-to-text_get-models.html">watson-speech/speech-to-text/get-models</a></li><li><a href="module-watson-speech_speech-to-text_recognize-file.html">watson-speech/speech-to-text/recognize-file</a></li><li><a href="module-watson-speech_speech-to-text_recognize-microphone.html">watson-speech/speech-to-text/recognize-microphone</a></li><li><a href="module-watson-speech_text-to-speech.html">watson-speech/text-to-speech</a></li><li><a href="module-watson-speech_text-to-speech_get-voices.html">watson-speech/text-to-speech/get-voices</a></li><li><a href="module-watson-speech_text-to-speech_synthesize.html">watson-speech/text-to-speech/synthesize</a></li></ul><h3>Classes</h3><ul><li><a href="FilePlayer.html">FilePlayer</a></li><li><a href="FormatStream.html">FormatStream</a></li><li><a href="RecognizeStream.html">RecognizeStream</a></li><li><a href="ResultStream.html">ResultStream</a></li><li><a href="SpeakerStream.html">SpeakerStream</a></li><li><a href="TimingStream.html">TimingStream</a></li><li><a href="UrlPlayer.html">UrlPlayer</a></li><li><a href="WebAudioL16Stream.html">WebAudioL16Stream</a></li><li><a href="WritableElementStream.html">WritableElementStream</a></li></ul><h3>Events</h3><ul><li><a href="RecognizeStream.html#event:close">close</a></li><li><a href="RecognizeStream.html#event:data">data</a></li><li><a href="RecognizeStream.html#event:error">error</a></li><li><a href="RecognizeStream.html#event:listening">listening</a></li><li><a href="RecognizeStream.html#event:message">message</a></li><li><a href="RecognizeStream.html#event:open">open</a></li><li><a href="RecognizeStream.html#event:send-data">send-data</a></li><li><a href="RecognizeStream.html#event:send-json">send-json</a></li><li><a 
href="RecognizeStream.html#event:stop">stop</a></li><li><a href="SpeakerStream.html#event:data">data</a></li></ul><h3>Global</h3><ul><li><a href="global.html#getContentTypeFromFile">getContentTypeFromFile</a></li><li><a href="global.html#playFile">playFile</a></li></ul>
</nav>
<br class="clear">
<footer>
Documentation generated by <a href="https://github.com/jsdoc3/jsdoc">JSDoc 3.4.3</a> on Tue Feb 21 2017 17:41:51 GMT+0000 (UTC)
</footer>
<script> prettyPrint(); </script>
<script src="scripts/linenumber.js"> </script>
</body>
</html>