watson-speech
Version:
IBM Watson Speech to Text and Text to Speech SDK for web browsers.
179 lines (144 loc) • 5.68 kB
HTML
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>JSDoc: Source: format-stream.js</title>
<script src="scripts/prettify/prettify.js"> </script>
<script src="scripts/prettify/lang-css.js"> </script>
<!--[if lt IE 9]>
<script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
<![endif]-->
<link type="text/css" rel="stylesheet" href="styles/prettify-tomorrow.css">
<link type="text/css" rel="stylesheet" href="styles/jsdoc-default.css">
</head>
<body>
<div id="main">
<h1 class="page-title">Source: format-stream.js</h1>
<section>
<article>
<pre class="prettyprint source linenums"><code>'use strict';
var Transform = require('stream').Transform;
var util = require('util');
var clone = require('clone');
/**
 * Applies some basic formatting to transcriptions:
 *  - Capitalize the first word of each sentence
 *  - Add a period to the end
 *  - Fix any "cruft" in the transcription
 *  - etc.
 *
 * When a source stream is piped into this one, its `result` events are routed to `handleResult`
 * and, if the source has a `stop` method, that method is exposed as `stop` on this stream.
 *
 * @param {Object} [opts] - also handed to the stream.Transform constructor
 * @param {String} [opts.model=''] - some models / languages need special handling
 * @param {String} [opts.hesitation='\u2026'] - what to put down for a "hesitation" event, defaults to an ellipsis (...)
 * @constructor
 */
function FormatStream(opts) {
  this.opts = util._extend({
    model: '', // some models should have all spaces removed
    hesitation: '\u2026', // ellipsis
    decodeStrings: true
  }, opts);
  Transform.call(this, opts);

  // An explicit `model: undefined` / `null` (e.g. a forwarded, unset option) overrides the '' default above,
  // so fall back to '' here instead of throwing on `.substring`.
  var model = (typeof this.opts.model === 'string') ? this.opts.model : '';
  var language = model.substring(0, 5);
  this.isJaCn = (language === 'ja-JP' || language === 'zh-CN');

  var self = this;
  this.on('pipe', function(source) {
    source.on('result', self.handleResult.bind(self));
    if (source.stop) {
      self.stop = source.stop.bind(source);
    }
  });
}
util.inherits(FormatStream, Transform);
// When the service detects a "hesitation" pause, it literally puts the string "%HESITATION" into the transcription.
// The marker is matched when it is followed by whitespace (which is consumed) or when it ends the text, so that a
// marker at the end of a transcript, or one that is a whole timestamp word on its own, is replaced as well.
var reHesitation = /%HESITATION(?:\s|$)/g;
// detect the same character repeated three or more times and remove it
// NOTE(review): this matches any character, so digits and punctuation are affected too ("1000" becomes "1") - confirm that is intended
var reRepeatedCharacter = /(.)\1{2,}/g;
var reDUnderscoreWords = /D_[^\s]+/g; // replace D_(anything)
/**
 * Formats one or more words, removing special symbols, junk, and spacing for some languages.
 *
 * Hesitation markers are replaced with `this.opts.hesitation`, runs of three or more identical
 * characters and `D_...` words are removed, and surrounding whitespace is trimmed.
 * When `this.isJaCn` is set, all spaces are removed as well.
 *
 * @param {String} text
 * @returns {String} the cleaned text; may be an empty string if the input was only junk
 */
FormatStream.prototype.clean = function clean(text) {
  // clean out "junk"; the final trim drops whitespace left behind when the first or last word was junk,
  // which would otherwise end up in front of the period
  text = text.trim()
    .replace(reHesitation, this.opts.hesitation)
    .replace(reRepeatedCharacter, '')
    .replace(reDUnderscoreWords, '')
    .trim();
  // remove spaces for Japanese and Chinese
  if (this.isJaCn) {
    text = text.replace(/ /g, '');
  }
  return text;
};
/**
 * Upper-cases the first character of the given text and leaves the rest untouched.
 * @param {String} text
 * @returns {String} the capitalized text ('' when given an empty string)
 */
FormatStream.prototype.capitalize = function capitalize(text) {
  var initial = text.slice(0, 1);
  var remainder = text.slice(1);
  return initial.toUpperCase() + remainder;
};
/**
 * Appends the sentence terminator to the given text: an ideographic full stop when
 * `this.isJaCn` is set, otherwise a period followed by a space.
 * @param {String} text
 * @returns {String}
 */
FormatStream.prototype.period = function period(text) {
  if (this.isJaCn) {
    return text + '。';
  }
  return text + '. ';
};
/**
 * Transform hook: treats the chunk as one sentence, runs it through clean, capitalize and period,
 * pushes the formatted text and then signals that the chunk has been handled.
 * @param chunk - converted to text with `toString()`
 * @param encoding - unused
 * @param {Function} next - called once the formatted text has been pushed
 */
FormatStream.prototype._transform = function(chunk, encoding, next) {
  var raw = chunk.toString();
  var cleaned = this.clean(raw);
  var sentence = this.period(this.capitalize(cleaned));
  this.push(sentence);
  next();
};
/**
 * Creates a new result with all transcriptions formatted and emits it as a `result` event.
 *
 * The incoming result is cloned first, so the object that was passed in is not modified.
 * Every alternative's transcript is cleaned and capitalized, and gets a period when the result is final.
 * Word timestamps, when present, are cleaned too; entries whose word was entirely junk are dropped,
 * the first remaining word is capitalized and, for a final result, the last remaining word gets the period.
 *
 * @param {Object} result - must have an `alternatives` array; `final` marks the result as final
 */
FormatStream.prototype.handleResult = function handleResult(result) {
  var formatted = clone(result);
  var isFinal = formatted.final;

  formatted.alternatives = formatted.alternatives.map(function(alt) {
    alt.transcript = this.capitalize(this.clean(alt.transcript));
    if (isFinal) {
      alt.transcript = this.period(alt.transcript);
    }

    if (alt.timestamps) {
      // timestamps is an array of arrays, each sub-array is in the form ["word", startTime, endTime]
      var words = alt.timestamps.map(function(ts) {
        ts[0] = this.clean(ts[0]);
        return ts;
      }, this).filter(function(ts) {
        // remove any timestamps without a word (due to cleaning out junk words)
        return ts[0];
      });

      // capitalize / punctuate only after filtering, so that they land on real words rather than on removed junk
      if (words.length !== 0) {
        words[0][0] = this.capitalize(words[0][0]);
        if (isFinal) {
          var last = words[words.length - 1];
          last[0] = this.period(last[0]);
        }
      }
      alt.timestamps = words;
    }

    return alt;
  }, this);

  this.emit('result', formatted);
};
// `promise` comes from the local ./promise module
// NOTE(review): that module is not visible here - presumably a helper shared by the SDK's streams; confirm before relying on it
FormatStream.prototype.promise = require('./promise');
// No-op default. The constructor's `pipe` handler replaces it on the instance with the source stream's
// own `stop` (bound to the source) whenever the piped source has one.
FormatStream.prototype.stop = function(){}; // usually overwritten during the `pipe` event
// the constructor is the module's only export
module.exports = FormatStream;
</code></pre>
</article>
</section>
</div>
<nav>
<h2><a href="index.html">Home</a></h2><h3>Classes</h3><ul><li><a href="FormatStream.html">FormatStream</a></li><li><a href="MediaElementAudioStream.html">MediaElementAudioStream</a></li><li><a href="RecognizeStream.html">RecognizeStream</a></li><li><a href="TimingStream.html">TimingStream</a></li></ul><h3>Events</h3><ul><li><a href="MicrophoneStream.html#event:data">data</a></li><li><a href="MicrophoneStream.html#event:raw">raw</a></li><li><a href="RecognizeStream.html#event:connection-close">connection-close</a></li><li><a href="RecognizeStream.html#event:data">data</a></li><li><a href="RecognizeStream.html#event:error">error</a></li><li><a href="RecognizeStream.html#event:results">results</a></li></ul><h3>Global</h3><ul><li><a href="global.html#MAX_WAV">MAX_WAV</a></li></ul>
</nav>
<br class="clear">
<footer>
Documentation generated by <a href="https://github.com/jsdoc3/jsdoc">JSDoc 3.4.0</a> on Mon Feb 08 2016 16:11:17 GMT+0000 (UTC)
</footer>
<script> prettyPrint(); </script>
<script src="scripts/linenumber.js"> </script>
</body>
</html>