UNPKG

@bbc/react-transcript-editor

Version:

A React component to make transcribing audio and video easier and faster.

14 lines 2.42 kB
"use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.default=void 0;var _index=_interopRequireDefault(require("../generate-entities-ranges/index.js")),_groupWordsBySpeakers=_interopRequireDefault(require("./group-words-by-speakers.js"));function _interopRequireDefault(obj){return obj&&obj.__esModule?obj:{default:obj}}/** * Convert BBC Kaldi json to draftJs * see `sample` folder for example of input and output as well as `example-usage.js` * */ /** * groups words list from kaldi transcript based on punctuation. * @todo To be more accurate, should introduce an honorifics library to do the splitting of the words. * @param {array} words - array of words opbjects from kaldi transcript */var groupWordsInParagraphs=function(words){var results=[],paragraph={words:[],text:[]};return words.forEach(function(word){/[.?!]/.test(word.punct)?(paragraph.words.push(word),paragraph.text.push(word.punct),paragraph.text=paragraph.text.join(" "),results.push(paragraph),paragraph={words:[],text:[]}):(paragraph.words.push(word),paragraph.text.push(word.punct))}),results},bbcKaldiToDraft=function(bbcKaldiJson){var tmpWords,results=[],speakerSegmentation=null,wordsByParagraphs=[];return void 0===bbcKaldiJson.retval?(tmpWords=bbcKaldiJson.words,void 0!==bbcKaldiJson.segmentation&&(speakerSegmentation=bbcKaldiJson.segmentation)):(tmpWords=bbcKaldiJson.retval.words,void 0!==bbcKaldiJson.retval.segmentation&&(speakerSegmentation=bbcKaldiJson.retval.segmentation)),wordsByParagraphs=null===speakerSegmentation?groupWordsInParagraphs(tmpWords):(0,_groupWordsBySpeakers.default)(tmpWords,speakerSegmentation),wordsByParagraphs.forEach(function(paragraph,i){// if paragraph contain words // eg sometimes the speaker segmentation might not contain words :man-shrugging: if(void 0!==paragraph.words[0]){var speakerLabel="TBC ".concat(i);null!==speakerSegmentation&&(speakerLabel=paragraph.speaker);var draftJsContentBlockParagraph={text:paragraph.text,type:"paragraph",data:{speaker:speakerLabel,words:paragraph.words,start:paragraph.words[0].start},// the entities as ranges are each word in the space-joined text, // so it needs to be compute for each the offset from the beginning of the paragraph and the length entityRanges:(0,_index.default)(paragraph.words,"punct")// wordAttributeName };results.push(draftJsContentBlockParagraph)}}),results},_default=bbcKaldiToDraft;exports.default=_default;