@bbc/react-transcript-editor
Version:
A React component to make transcribing audio and video easier and faster.
36 lines • 2.89 kB
JavaScript
;Object.defineProperty(exports,"__esModule",{value:!0}),exports.default=void 0;/**
edge cases
- more segments then words - not an issue if you start by matching words with segment
and handle edge case where it doesn't find a match
- more words then segments - orphan words
*/function groupWordsInParagraphsBySpeakers(words,segments){// add speakers to each word
var wordsWithSpeakers=addSpeakerToEachWord(words,segments.segments),result=groupWordsBySpeaker(wordsWithSpeakers);// group words by speakers sequentially
return result}/**
* Add speakers to each words
* if it doesn't have add unknown attribute `U_UKN`
* @param {*} words
* @param {*} segments
*/function addSpeakerToEachWord(words,segments){var tmpWordsWithSpeakers=[];return words.forEach(function(word){var tmpSpeakerSegment=findSegmentForWord(word,segments);word.speaker=formatSpeakerName(tmpSpeakerSegment.speaker),tmpWordsWithSpeakers.push(word)}),tmpWordsWithSpeakers}/**
* Groups Words by speaker attribute
* @param {array} wordsWithSpeakers - same as kaldi words list but with a `speaker` label attribute on each word
* @return {array} - list of paragraph objcts, with words, text and sepaker attributes.
* where words is an array and the other two are strings.
*/function groupWordsBySpeaker(wordsWithSpeakers){var currentSpeaker=wordsWithSpeakers[0].speaker,results=[],paragraph={words:[],text:"",speaker:""};return wordsWithSpeakers.forEach(function(word){currentSpeaker===word.speaker?(paragraph.words.push(word),paragraph.text+=word.punct+" ",paragraph.speaker=currentSpeaker):(currentSpeaker=word.speaker,paragraph.text=paragraph.text.trim(),results.push(paragraph),paragraph={words:[],text:"",speaker:"U_UKN"},paragraph.words.push(word),paragraph.text+=word.punct+" ")}),results.push(paragraph),results}/**
* Helper functions
*/ /**
* given word start and end time attributes
* looks for segment range that contains that word
* if it doesn't find any it returns a segment with `UKN`
* speaker attributes.
* @param {object} word - word object
* @param {array} segments - list of segments objects
* @return {object} - a single segment whose range contains the word
*/function findSegmentForWord(word,segments){var tmpSegment=segments.find(function(seg){var segEnd=seg.start+seg.duration;return word.start>=seg.start&&word.end<=segEnd});// if find doesn't find any matches it returns an undefined
return void 0===tmpSegment?{"@type":"Segment",// keeping both speaker id and gender as this is used later
// to format speaker label combining the two
speaker:{"@id":"UKN",gender:"U"}}:tmpSegment}/**
* formats kaldi speaker object into a string
* Combining Gender and speaker Id
* @param {object} speaker - BBC kaldi speaker object
* @return {string} -
*/function formatSpeakerName(speaker){return speaker.gender+"_"+speaker["@id"]}var _default=groupWordsInParagraphsBySpeakers;exports.default=_default;