@bbc/react-transcript-editor
Version:
A React component to make transcribing audio and video easier and faster.
125 lines • 3.94 kB
JavaScript
"use strict";var _index=_interopRequireDefault(require("../generate-entities-ranges/index.js"));Object.defineProperty(exports,"__esModule",{value:!0}),exports.default=void 0;function _interopRequireDefault(obj){return obj&&obj.__esModule?obj:{default:obj}}/**
* Convert BBC Kaldi json
```
{
"action": "audio-transcribe",
"retval": {
"status": true,
"wonid": "octo:2692ea33-d595-41d8-bfd5-aa7f2d2f89ee",
"punct": "There is a day. About ten years ago when ...",
"words": [
{
"start": 13.02,
"confidence": 0.68,
"end": 13.17,
"word": "there",
"punct": "There",
"index": 0
},
{
"start": 13.17,
"confidence": 0.61,
"end": 13.38,
"word": "is",
"punct": "is",
"index": 1
},
...
```
*
* into
*
```
const blocks = [
{
"text": "There is a day.",
"type": "paragraph",
"data": {
"speaker": "TBC 0",
"words": [
{
"start": 13.02,
"confidence": 0.68,
"end": 13.17,
"word": "there",
"punct": "There",
"index": 0
},
{
"start": 13.17,
"confidence": 0.61,
"end": 13.38,
"word": "is",
"punct": "is",
"index": 1
},
{
"start": 13.38,
"confidence": 0.99,
"end": 13.44,
"word": "a",
"punct": "a",
"index": 2
},
{
"start": 13.44,
"confidence": 1,
"end": 13.86,
"word": "day",
"punct": "day.",
"index": 3
}
],
"start": 13.02
},
"entityRanges": [
{
"start": 13.02,
"end": 13.17,
"confidence": 0.68,
"text": "There",
"offset": 0,
"length": 5,
"key": "li6c6ld"
},
{
"start": 13.17,
"end": 13.38,
"confidence": 0.61,
"text": "is",
"offset": 6,
"length": 2,
"key": "pcgzkp6"
},
{
"start": 13.38,
"end": 13.44,
"confidence": 0.99,
"text": "a",
"offset": 9,
"length": 1,
"key": "ngomd9"
},
{
"start": 13.44,
"end": 13.86,
"confidence": 1,
"text": "day.",
"offset": 11,
"length": 4,
"key": "sgmfl4f"
}
]
},
...
```
*
*/ /**
* groups words list from kaldi transcript based on punctuation.
* @todo To be more accurate, should introduce an honorifics library to do the splitting of the words.
* @param {array} words - array of words opbjects from kaldi transcript
*/var groupWordsInParagraphs=function(words){var results=[],paragraph={words:[],text:[]};return words.forEach(function(word){/[.?!]/.test(word.punct)?(paragraph.words.push(word),paragraph.text.push(word.punct),results.push(paragraph),paragraph={words:[],text:[]}):(paragraph.words.push(word),paragraph.text.push(word.punct))}),results},bbcKaldiToDraft=function(bbcKaldiJson){var tmpWords,results=[];tmpWords=void 0===bbcKaldiJson.retval?bbcKaldiJson.words:bbcKaldiJson.retval.words;var wordsByParagraphs=groupWordsInParagraphs(tmpWords);return wordsByParagraphs.forEach(function(paragraph,i){var draftJsContentBlockParagraph={text:paragraph.text.join(" "),type:"paragraph",data:{speaker:"TBC ".concat(i),words:paragraph.words,start:paragraph.words[0].start},// the entities as ranges are each word in the space-joined text,
// so it needs to be compute for each the offset from the beginning of the paragraph and the length
entityRanges:(0,_index.default)(paragraph.words,"punct")// wordAttributeName
};// console.log(JSON.stringify(draftJsContentBlockParagraph,null,2))
results.push(draftJsContentBlockParagraph)}),results},_default=bbcKaldiToDraft;exports.default=_default;