@bbc/react-transcript-editor
Version:
A React component to make transcribing audio and video easier and faster.
17 lines • 3.17 kB
JavaScript
;var _index=_interopRequireDefault(require("../generate-entities-ranges/index.js"));Object.defineProperty(exports,"__esModule",{value:!0}),exports.default=void 0;function _interopRequireDefault(obj){return obj&&obj.__esModule?obj:{default:obj}}/**
* Convert Speechmatics Json to DraftJs
* see `sample` folder for example of input and output as well as `example-usage.js`
*/ /**
* groups words list from speechmatics based on punctuation.
* @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
* @todo As this function is also used in the bbc-kaldi adapter, should it be refactored into its own file?
* @param {array} words - array of words objects from speechmatics transcript
*/var groupWordsInParagraphs=function(words){var results=[],paragraph={words:[],text:[]};return words.forEach(function(word){/[.?!]/.test(word.punct)?(paragraph.words.push(word),paragraph.text.push(word.punct),results.push(paragraph),paragraph={words:[],text:[]}):(paragraph.words.push(word),paragraph.text.push(word.punct))}),results},getSpeaker=function(start,speakers){for(var speakerIdx in speakers){var speaker=speakers[speakerIdx];if(start>=speaker.start&start<speaker.end)return speaker.name}return"UNK"},curatePunctuation=function(words){var curatedWords=[];return words.forEach(function(word){/[.?!]/.test(word.name)?(curatedWords[curatedWords.length-1].name=curatedWords[curatedWords.length-1].name+word.name,curatedWords[curatedWords.length-1].duration=(parseFloat(curatedWords[curatedWords.length-1].duration)+parseFloat(word.duration)).toString()):curatedWords.push(word)}),curatedWords},speechmaticsToDraft=function(speechmaticsJson){var tmpWords,results=[];tmpWords=curatePunctuation(speechmaticsJson.words),tmpWords=tmpWords.map(function(element,index){return{start:element.time,end:(parseFloat(element.time)+parseFloat(element.duration)).toString(),confidence:element.confidence,word:element.name.toLowerCase().replace(/[.?!]/g,""),punct:element.name,index:index}});var tmpSpeakers;tmpSpeakers=speechmaticsJson.speakers,tmpSpeakers=tmpSpeakers.map(function(element){return{start:element.time,end:(parseFloat(element.time)+parseFloat(element.duration)).toString(),name:element.name}});var wordsByParagraphs=groupWordsInParagraphs(tmpWords);return wordsByParagraphs.forEach(function(paragraph){var paragraphStart=paragraph.words[0].start,draftJsContentBlockParagraph={text:paragraph.text.join(" "),type:"paragraph",data:{speaker:getSpeaker(paragraphStart,tmpSpeakers),words:paragraph.words,start:paragraphStart},// the entities as ranges are each word in the space-joined text,
// so it needs to be compute for each the offset from the beginning of the paragraph and the length
entityRanges:(0,_index.default)(paragraph.words,"punct")// wordAttributeName
};results.push(draftJsContentBlockParagraph)}),results},_default=speechmaticsToDraft;/**
* Determines the speaker of a paragraph by comparing the start time of the paragraph with
* the speaker times.
* @param {float} start - Starting point of paragraph
* @param {array} speakers - list of all speakers with start and end time
*/exports.default=_default;