UNPKG

@bbc/react-transcript-editor

Version:

A React component to make transcribing audio and video easier and faster.

17 lines 3.17 kB
"use strict";var _index=_interopRequireDefault(require("../generate-entities-ranges/index.js"));Object.defineProperty(exports,"__esModule",{value:!0}),exports.default=void 0;function _interopRequireDefault(obj){return obj&&obj.__esModule?obj:{default:obj}}/** * Convert Speechmatics Json to DraftJs * see `sample` folder for example of input and output as well as `example-usage.js` */ /** * groups words list from speechmatics based on punctuation. * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words. * @todo As this function is also used in the bbc-kaldi adapter, should it be refactored into its own file? * @param {array} words - array of words objects from speechmatics transcript */var groupWordsInParagraphs=function(words){var results=[],paragraph={words:[],text:[]};return words.forEach(function(word){/[.?!]/.test(word.punct)?(paragraph.words.push(word),paragraph.text.push(word.punct),results.push(paragraph),paragraph={words:[],text:[]}):(paragraph.words.push(word),paragraph.text.push(word.punct))}),results},getSpeaker=function(start,speakers){for(var speakerIdx in speakers){var speaker=speakers[speakerIdx];if(start>=speaker.start&start<speaker.end)return speaker.name}return"UNK"},curatePunctuation=function(words){var curatedWords=[];return words.forEach(function(word){/[.?!]/.test(word.name)?(curatedWords[curatedWords.length-1].name=curatedWords[curatedWords.length-1].name+word.name,curatedWords[curatedWords.length-1].duration=(parseFloat(curatedWords[curatedWords.length-1].duration)+parseFloat(word.duration)).toString()):curatedWords.push(word)}),curatedWords},speechmaticsToDraft=function(speechmaticsJson){var tmpWords,results=[];tmpWords=curatePunctuation(speechmaticsJson.words),tmpWords=tmpWords.map(function(element,index){return{start:element.time,end:(parseFloat(element.time)+parseFloat(element.duration)).toString(),confidence:element.confidence,word:element.name.toLowerCase().replace(/[.?!]/g,""),punct:element.name,index:index}});var tmpSpeakers;tmpSpeakers=speechmaticsJson.speakers,tmpSpeakers=tmpSpeakers.map(function(element){return{start:element.time,end:(parseFloat(element.time)+parseFloat(element.duration)).toString(),name:element.name}});var wordsByParagraphs=groupWordsInParagraphs(tmpWords);return wordsByParagraphs.forEach(function(paragraph){var paragraphStart=paragraph.words[0].start,draftJsContentBlockParagraph={text:paragraph.text.join(" "),type:"paragraph",data:{speaker:getSpeaker(paragraphStart,tmpSpeakers),words:paragraph.words,start:paragraphStart},// the entities as ranges are each word in the space-joined text, // so it needs to be compute for each the offset from the beginning of the paragraph and the length entityRanges:(0,_index.default)(paragraph.words,"punct")// wordAttributeName };results.push(draftJsContentBlockParagraph)}),results},_default=speechmaticsToDraft;/** * Determines the speaker of a paragraph by comparing the start time of the paragraph with * the speaker times. * @param {float} start - Starting point of paragraph * @param {array} speakers - list of all speakers with start and end time */exports.default=_default;