@bbc/react-transcript-editor
Version:
A React component to make transcribing audio and video easier and faster.
17 lines • 4.48 kB
JavaScript
var _index = _interopRequireDefault(require("../generate-entities-ranges/index.js"));

Object.defineProperty(exports, "__esModule", { value: true });
exports.default = exports.mapPunctuationItemsToWords = exports.appendPunctuationToPreviousWord = exports.getBestAlternativeForWord = exports.stripLeadingSpace = void 0;

/**
 * Compiler interop helper: wraps a CommonJS export so it can be read
 * through `.default`, leaving modules flagged `__esModule` untouched.
 */
function _interopRequireDefault(obj) {
  return obj && obj.__esModule ? obj : { default: obj };
}

/**
 * Compiler helper equivalent to object spread: copies the own enumerable
 * string and symbol keys of every argument after `target` onto `target`.
 * `null` / `undefined` sources are treated as empty objects.
 */
function _objectSpread(target) {
  for (var i = 1; i < arguments.length; i++) {
    var source = arguments[i] == null ? {} : arguments[i];
    var ownKeys = Object.keys(source);

    if (typeof Object.getOwnPropertySymbols === "function") {
      ownKeys = ownKeys.concat(
        Object.getOwnPropertySymbols(source).filter(function (sym) {
          return Object.getOwnPropertyDescriptor(source, sym).enumerable;
        })
      );
    }

    ownKeys.forEach(function (key) {
      _defineProperty(target, key, source[key]);
    });
  }

  return target;
}

/**
 * Compiler helper: sets `obj[key] = value`, redefining the property as a
 * plain enumerable/configurable/writable data property when the key is
 * already present on the object or its prototype chain.
 */
function _defineProperty(obj, key, value) {
  if (key in obj) {
    Object.defineProperty(obj, key, {
      value: value,
      enumerable: true,
      configurable: true,
      writable: true
    });
  } else {
    obj[key] = value;
  }

  return obj;
}

/**
 * Removes a single leading whitespace character from a string.
 * @param {string} word - text to trim at the start
 * @returns {string} the text without its first character if that was whitespace
 */
var stripLeadingSpace = function (word) {
  return word.replace(/^\s/, "");
};

exports.stripLeadingSpace = stripLeadingSpace;

/**
 * Picks the alternative to use for a transcript item.
 *
 * Items whose `type` contains "punctuation" yield a COPY of their first
 * alternative with `confidence` forced to 1. The copy matters: the input
 * transcript object is owned by the caller and must not be modified.
 * Every other item yields the alternative with the highest numeric
 * `confidence`; on a tie the later alternative wins.
 *
 * @param {object} word - transcript item with `type` and `alternatives`
 * @returns {object} the selected alternative (`content`, `confidence`)
 * @throws {TypeError} when a non-punctuation item has no alternatives
 */
var getBestAlternativeForWord = function (word) {
  if (/punctuation/.test(word.type)) {
    // Transcribe doesn't provide a confidence for punctuation
    return Object.assign({}, word.alternatives[0], { confidence: 1 });
  }

  return word.alternatives.reduce(function (best, candidate) {
    return parseFloat(best.confidence) > parseFloat(candidate.confidence) ? best : candidate;
  });
};

exports.getBestAlternativeForWord = getBestAlternativeForWord;

/**
 * Normalizes words so they can be used in
 * the generic generateEntitiesRanges() method
 *
 * @param {object} currentWord - transcript item with `start_time`, `end_time` and `alternatives`
 * @returns {{start: number, end: number, text: string, confidence: number}}
 */
var normalizeWord = function (currentWord) {
  var bestAlternative = getBestAlternativeForWord(currentWord);

  return {
    start: parseFloat(currentWord.start_time),
    end: parseFloat(currentWord.end_time),
    text: bestAlternative.content,
    confidence: parseFloat(bestAlternative.confidence)
  };
};

/**
 * Returns a copy of `previousWord` in which the content of the punctuation
 * item's first alternative (minus one leading whitespace character) is
 * appended to the content of every alternative. Neither argument is modified.
 *
 * @param {object} punctuation - punctuation item; only `alternatives[0].content` is read
 * @param {object} previousWord - item the punctuation is attached to
 * @returns {object} new item with the same fields as `previousWord` and updated alternatives
 */
var appendPunctuationToPreviousWord = function (punctuation, previousWord) {
  var punctuationContent = punctuation.alternatives[0].content;

  return _objectSpread({}, previousWord, {
    alternatives: previousWord.alternatives.map(function (alternative) {
      return _objectSpread({}, alternative, {
        content: alternative.content + stripLeadingSpace(punctuationContent)
      });
    })
  });
};

exports.appendPunctuationToPreviousWord = appendPunctuationToPreviousWord;

/**
 * Folds every item of type "punctuation" into the item that precedes it, so
 * the returned list carries punctuation as part of the preceding word's text.
 *
 * Each punctuation mark is merged into the last item already emitted, which
 * means a run of consecutive marks accumulates on the same word instead of
 * discarding it. A punctuation item with nothing before it cannot be attached
 * to anything and is passed through unchanged.
 *
 * @param {array} words - transcript items in order
 * @returns {array} new list of items; the input array is not modified
 */
var mapPunctuationItemsToWords = function (words) {
  var merged = [];

  words.forEach(function (item) {
    var lastIndex = merged.length - 1;

    if (item.type === "punctuation" && lastIndex >= 0) {
      merged[lastIndex] = appendPunctuationToPreviousWord(item, merged[lastIndex]);
    } else {
      merged.push(item);
    }
  });

  return merged;
};

exports.mapPunctuationItemsToWords = mapPunctuationItemsToWords;

/**
 * Groups the word list from an Amazon Transcribe transcript into paragraphs
 * based on punctuation: a paragraph is closed by any word whose text
 * contains `.`, `?` or `!`. Words left over after the last such word form a
 * final paragraph, so a transcript that does not end in punctuation keeps
 * its closing words.
 *
 * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
 * @param {array} words - transcript items with punctuation already merged into words
 * @returns {array} paragraphs, each `{ words: normalizedWord[], text: string[] }`
 */
var groupWordsInParagraphs = function (words) {
  var results = [];
  var paragraph = { words: [], text: [] };

  words.forEach(function (word) {
    var normalizedWord = normalizeWord(word);
    // normalizeWord already resolved the best alternative; reuse its text.
    var content = normalizedWord.text;

    paragraph.words.push(normalizedWord);
    paragraph.text.push(content);

    if (/[.?!]/.test(content)) {
      results.push(paragraph);
      paragraph = { words: [], text: [] };
    }
  });

  // Flush words that were never followed by sentence-ending punctuation.
  if (paragraph.words.length > 0) {
    results.push(paragraph);
  }

  return results;
};

/**
 * Converts Amazon Transcribe JSON into a list of paragraph blocks, one per
 * group of words, each carrying its text, word timings and entity ranges.
 *
 * @param {object} amazonTranscribeJson - transcript; `results.items` is the item list
 * @returns {array} blocks of shape `{ text, type: "paragraph", data, entityRanges }`
 */
var amazonTranscribeToDraft = function (amazonTranscribeJson) {
  var tmpWords = amazonTranscribeJson.results.items;
  var wordsWithRemappedPunctuation = mapPunctuationItemsToWords(tmpWords);
  var wordsByParagraphs = groupWordsInParagraphs(wordsWithRemappedPunctuation);

  return wordsByParagraphs.map(function (paragraph, i) {
    return {
      text: paragraph.text.join(" "),
      type: "paragraph",
      data: {
        speaker: "TBC ".concat(i),
        words: paragraph.words,
        start: parseFloat(paragraph.words[0].start)
      },
      // the entities as ranges are each word in the space-joined text,
      // so it needs to be compute for each the offset from the beginning of the paragraph and the length.
      // The second argument is the wordAttributeName: the attribute of the word
      // object containing the text, eg 'punct' or 'text'.
      entityRanges: (0, _index.default)(paragraph.words, "text")
    };
  });
};

var _default = amazonTranscribeToDraft;
exports.default = _default;