UNPKG

@bbc/react-transcript-editor

Version:

A React component to make transcribing audio and video easier and faster.

36 lines 2.89 kB
"use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.default=void 0;/** edge cases - more segments then words - not an issue if you start by matching words with segment and handle edge case where it doesn't find a match - more words then segments - orphan words */function groupWordsInParagraphsBySpeakers(words,segments){// add speakers to each word var wordsWithSpeakers=addSpeakerToEachWord(words,segments.segments),result=groupWordsBySpeaker(wordsWithSpeakers);// group words by speakers sequentially return result}/** * Add speakers to each words * if it doesn't have add unknown attribute `U_UKN` * @param {*} words * @param {*} segments */function addSpeakerToEachWord(words,segments){var tmpWordsWithSpeakers=[];return words.forEach(function(word){var tmpSpeakerSegment=findSegmentForWord(word,segments);word.speaker=formatSpeakerName(tmpSpeakerSegment.speaker),tmpWordsWithSpeakers.push(word)}),tmpWordsWithSpeakers}/** * Groups Words by speaker attribute * @param {array} wordsWithSpeakers - same as kaldi words list but with a `speaker` label attribute on each word * @return {array} - list of paragraph objcts, with words, text and sepaker attributes. * where words is an array and the other two are strings. */function groupWordsBySpeaker(wordsWithSpeakers){var currentSpeaker=wordsWithSpeakers[0].speaker,results=[],paragraph={words:[],text:"",speaker:""};return wordsWithSpeakers.forEach(function(word){currentSpeaker===word.speaker?(paragraph.words.push(word),paragraph.text+=word.punct+" ",paragraph.speaker=currentSpeaker):(currentSpeaker=word.speaker,paragraph.text=paragraph.text.trim(),results.push(paragraph),paragraph={words:[],text:"",speaker:"U_UKN"},paragraph.words.push(word),paragraph.text+=word.punct+" ")}),results.push(paragraph),results}/** * Helper functions */ /** * given word start and end time attributes * looks for segment range that contains that word * if it doesn't find any it returns a segment with `UKN` * speaker attributes. * @param {object} word - word object * @param {array} segments - list of segments objects * @return {object} - a single segment whose range contains the word */function findSegmentForWord(word,segments){var tmpSegment=segments.find(function(seg){var segEnd=seg.start+seg.duration;return word.start>=seg.start&&word.end<=segEnd});// if find doesn't find any matches it returns an undefined return void 0===tmpSegment?{"@type":"Segment",// keeping both speaker id and gender as this is used later // to format speaker label combining the two speaker:{"@id":"UKN",gender:"U"}}:tmpSegment}/** * formats kaldi speaker object into a string * Combining Gender and speaker Id * @param {object} speaker - BBC kaldi speaker object * @return {string} - */function formatSpeakerName(speaker){return speaker.gender+"_"+speaker["@id"]}var _default=groupWordsInParagraphsBySpeakers;exports.default=_default;