/*
 * node-diarization
 * Version: (not captured)
 * Transcription and diarization using Whisper and Pyannote with NodeJS.
 * 130 lines (129 loc) • 5.19 kB
 * JavaScript
 */
;
// Flag this module's exports object with `__esModule: true`.
// NOTE(review): this looks like compiler-emitted CommonJS interop boilerplate
// (e.g. TypeScript output) — confirm against the build setup.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the named export as undefined (`void 0`); the actual function is
// assigned to it further down in this file.
exports.unionRecognitionAndDiarization = void 0;
// Characters that close a sentence when they are the last character of a word.
const SENTENCE_END_CHARS = ['.', '?', '!'];

/**
 * Regroups the words of all recognition segments into sentence-sized chunks.
 * A chunk is closed whenever a word ends with '.', '?' or '!'. Segments
 * without a `words` list are skipped.
 *
 * @param {Array<{words?: Array<{word: string, start: number, end: number}>}>} segments
 * @returns {Array<{words: Array<object>, info: {start: number, end: number}}>}
 *   chunks whose `info.start` / `info.end` are rounded to two decimals.
 */
const splitWordsIntoSentences = (segments) => {
    const sentences = [];
    let current = { words: [], info: {} };
    for (const segment of segments) {
        if (!segment.words) {
            continue;
        }
        for (const wordObj of segment.words) {
            current.words.push(wordObj);
            // Compare against undefined rather than using truthiness: a start
            // time of 0 is valid and must not be overwritten by the next word.
            if (current.info.start === undefined) {
                current.info.start = Number(wordObj.start.toFixed(2));
            }
            if (SENTENCE_END_CHARS.includes(wordObj.word.slice(-1))) {
                current.info.end = Number(wordObj.end.toFixed(2));
                sentences.push(current);
                current = { words: [], info: {} };
            }
        }
    }
    // Flush words that follow the last sentence terminator; otherwise the
    // tail of the transcript would be lost.
    if (current.words.length > 0) {
        const lastWord = current.words[current.words.length - 1];
        current.info.end = Number(lastWord.end.toFixed(2));
        sentences.push(current);
    }
    return sentences;
};

/**
 * Decides which speaker a chunk belongs to from its per-word speaker labels.
 *
 * @param {Array<{speaker?: string}>} words words already tagged with a speaker
 * @returns {{speaker: string, label: boolean, text: string}}
 *   `speaker` is the chosen speaker; `label` is false only when several
 *   speakers are present and none owns more than 65% of the labeled words;
 *   `text` describes the distribution ('100%', 'NO_WORDS', or
 *   'NAME p% / NAME p%').
 */
const resolveSegmentSpeaker = (words) => {
    if (words.length === 0) {
        return { speaker: '', label: true, text: 'NO_WORDS' };
    }
    // Null-prototype object: speaker names can never collide with inherited
    // keys such as "constructor".
    const counts = Object.create(null);
    for (const { speaker } of words) {
        if (speaker) {
            counts[speaker] = (counts[speaker] ?? 0) + 1;
        }
    }
    const speakers = Object.keys(counts);
    if (speakers.length <= 1) {
        // Zero or one distinct speaker: take the first word that carries a
        // label, so one unlabeled leading word does not hide a known speaker.
        const labelled = words.find((w) => w.speaker);
        return {
            speaker: labelled ? labelled.speaker : 'NO_SPEAKER',
            label: true,
            text: '100%',
        };
    }
    // Several speakers: guess by each speaker's share of the labeled words.
    const total = speakers.reduce((sum, name) => sum + counts[name], 0);
    const percents = speakers.map((name) => counts[name] * 100 / total);
    const text = speakers
        .map((name, i) => `${name} ${percents[i].toFixed()}%`)
        .join(' / ');
    // A share above 65% is treated as a reliable guess.
    const confidentIndex = percents.findIndex((p) => p > 65);
    if (confidentIndex > -1) {
        return { speaker: speakers[confidentIndex], label: true, text };
    }
    // Otherwise fall back to the largest share (first one on ties) and mark
    // the chunk as uncertain.
    const topIndex = percents.indexOf(Math.max(...percents));
    return { speaker: speakers[topIndex], label: false, text };
};

/**
 * Merges recognition output with diarization output: words are regrouped
 * into sentence-sized chunks, every word is tagged with the speaker of its
 * closest diarization segment (via `findClosest`), and a speaker is chosen
 * for each chunk.
 *
 * Note: `start` / `end` of every diarization item are converted with
 * `parseFloat` in place, i.e. the caller's objects are updated.
 *
 * @param {Array<{words?: Array<{word: string, start: number, end: number}>}>} segments
 *   recognition segments with word-level timestamps.
 * @param {Array<{start: (number|string), end: (number|string), speaker: string}>} diarization
 *   diarization segments; when empty, words get no speaker and the chunk
 *   speaker becomes 'NO_SPEAKER'.
 * @returns {Array<{info: {label: boolean, text: string, index: number, start: number, end: number},
 *   words: Array<{speaker: (string|undefined), word: string, start: number, end: number}>,
 *   speaker: string, text: string}>}
 */
const unionRecognitionAndDiarization = (segments, diarization) => {
    for (const d of diarization) {
        d.start = parseFloat(d.start);
        d.end = parseFloat(d.end);
    }
    // Split into sentence-sized chunks for better speaker accuracy.
    return splitWordsIntoSentences(segments).map((sentence, index) => {
        const words = sentence.words.map((wordObj) => {
            // findClosest cannot handle an empty list, so skip the lookup then.
            const closest = diarization.length > 0
                ? findClosest(diarization, wordObj)
                : undefined;
            return {
                speaker: closest?.speaker,
                word: wordObj.word,
                start: wordObj.start,
                end: wordObj.end,
            };
        });
        const { speaker, label, text } = resolveSegmentSpeaker(words);
        return {
            info: {
                label,
                text,
                index,
                start: sentence.info.start,
                end: sentence.info.end,
            },
            words,
            speaker,
            // Words carry their own spacing, so join them as-is and trim.
            text: words.map((w) => w.word).join('').trim(),
        };
    });
};
// Publish the function as the named export `unionRecognitionAndDiarization`.
exports.unionRecognitionAndDiarization = unionRecognitionAndDiarization;
// Returns the element of `data` that lies nearest to `target`. Distance is
// the smaller of the two boundary gaps: |start - target.start| and
// |end - target.end|. On equal distance the earlier element wins. `data`
// must contain at least one element (reduce has no initial value).
const findClosest = (data, target) => {
    const distanceTo = (segment) => {
        const startGap = Math.abs(segment.start - target.start);
        const endGap = Math.abs(segment.end - target.end);
        return Math.min(startGap, endGap);
    };
    return data.reduce((best, candidate) => {
        if (distanceTo(candidate) < distanceTo(best)) {
            return candidate;
        }
        return best;
    });
};
// Expose the same function as the `default` export as well.
exports.default = exports.unionRecognitionAndDiarization;