@bbc/react-transcript-editor
Version:
A React component to make transcribing audio and video easier and faster.
14 lines • 2.42 kB
JavaScript
Object.defineProperty(exports, "__esModule", { value: true });
exports.default = void 0;

var _index = _interopRequireDefault(require("../generate-entities-ranges/index.js"));
var _groupWordsBySpeakers = _interopRequireDefault(require("./group-words-by-speakers.js"));

/**
 * Interop helper for required modules: a module flagged with `__esModule`
 * is returned untouched, anything else is wrapped as `{ default: obj }` so
 * callers can always read `.default`.
 * @param {*} obj - value returned by `require`
 * @returns {Object} `obj` itself, or `{ default: obj }`
 */
function _interopRequireDefault(obj) {
  return obj && obj.__esModule ? obj : { default: obj };
}

/**
 * Convert BBC Kaldi json to draftJs
 * see `sample` folder for example of input and output as well as `example-usage.js`
 */

/**
 * Groups the words list from a kaldi transcript into paragraphs based on
 * punctuation: a paragraph is closed by every word whose `punct` attribute
 * contains `.`, `?` or `!`. Words left over after the last such word are
 * emitted as a final paragraph instead of being discarded.
 * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
 * @param {array} words - array of word objects from kaldi transcript; each is read via its `punct` attribute
 * @returns {array} paragraphs shaped `{ words: array, text: string }`, where
 *   `text` is the space-joined `punct` values of the paragraph's words
 */
var groupWordsInParagraphs = function (words) {
  var results = [];
  var paragraph = { words: [], text: [] };

  // Finalises the paragraph under construction (joins its text, stores it)
  // and starts a fresh one.
  var flushParagraph = function () {
    paragraph.text = paragraph.text.join(" ");
    results.push(paragraph);
    paragraph = { words: [], text: [] };
  };

  words.forEach(function (word) {
    paragraph.words.push(word);
    paragraph.text.push(word.punct);

    // Sentence-ending punctuation closes the current paragraph.
    if (/[.?!]/.test(word.punct)) {
      flushParagraph();
    }
  });

  // A transcript that does not end on sentence-ending punctuation still has
  // pending words here; without this flush they would be lost.
  if (paragraph.words.length > 0) {
    flushParagraph();
  }

  return results;
};

/**
 * Converts a BBC Kaldi transcript json into a list of draftJs content blocks.
 *
 * `words` and the optional `segmentation` are read from `bbcKaldiJson.retval`
 * when that attribute is defined, otherwise from `bbcKaldiJson` directly.
 * With a segmentation, paragraphs come from the `group-words-by-speakers`
 * helper; without one, words are grouped by punctuation and speakers get a
 * placeholder label.
 *
 * @param {Object} bbcKaldiJson - transcript json, optionally wrapped in `retval`
 * @returns {array} blocks shaped
 *   `{ text, type: "paragraph", data: { speaker, words, start }, entityRanges }`
 */
var bbcKaldiToDraft = function (bbcKaldiJson) {
  var results = [];

  // The payload is either at the top level or nested under `retval`.
  var transcript = bbcKaldiJson.retval === undefined ? bbcKaldiJson : bbcKaldiJson.retval;
  var tmpWords = transcript.words;
  var speakerSegmentation = transcript.segmentation !== undefined ? transcript.segmentation : null;

  var wordsByParagraphs = speakerSegmentation === null
    ? groupWordsInParagraphs(tmpWords)
    : (0, _groupWordsBySpeakers.default)(tmpWords, speakerSegmentation);

  wordsByParagraphs.forEach(function (paragraph, i) {
    // Skip paragraphs without words,
    // eg sometimes the speaker segmentation might not contain words.
    if (paragraph.words[0] === undefined) {
      return;
    }

    // Without speaker segmentation there is no speaker name to use, so fall
    // back to a placeholder based on the paragraph index.
    var speakerLabel = speakerSegmentation === null ? "TBC ".concat(i) : paragraph.speaker;

    results.push({
      text: paragraph.text,
      type: "paragraph",
      data: {
        speaker: speakerLabel,
        words: paragraph.words,
        start: paragraph.words[0].start
      },
      // The entities as ranges are each word in the space-joined text, so the
      // offset from the beginning of the paragraph and the length need to be
      // computed for each word; "punct" is the word attribute name handed to
      // the generate-entities-ranges helper.
      entityRanges: (0, _index.default)(paragraph.words, "punct")
    });
  });

  return results;
};

var _default = bbcKaldiToDraft;
exports.default = _default;