@hyperaudio/transcript-parser
Version:
Hyperaudio HTML transcript to JSON converter
112 lines (90 loc) • 3.34 kB
JavaScript
import htmlparser from 'htmlparser2';
/**
 * Convert a Hyperaudio HTML transcript into a JSON structure.
 *
 * Every opened tag starts a new "word" record; timing is read from the tag's
 * `data-m` / `data-d` attributes (milliseconds) or from `data-t`
 * ("start,duration", parsed as floats). A tag whose `class` is exactly
 * `speaker` marks its text as the speaker name of the enclosing `<p>`.
 *
 * @param {string} html - Transcript markup handed to the htmlparser2 parser.
 * @param {boolean} [offsets=false] - When true, also build `transcript` (all
 *   text concatenated) and record `startOffset` / `endOffset` character
 *   positions on words and paragraphs.
 * @param {number} [digits=2] - Number of decimals kept on `start` / `end`.
 * @returns {Promise<{words: object[], paragraphs: object[], speakers: string[], transcript?: string}>}
 *   Resolves when the parser signals the end of input; rejects with the
 *   parser's error or with any exception thrown while handling an event.
 */
export default function parseTranscript(html, offsets = false, digits = 2) {
  return new Promise((resolve, reject) => {
    const json = {
      words: [],
      paragraphs: [],
      speakers: [],
    };
    if (offsets) json.transcript = '';

    let currentWord = {};
    let currentParagraph = null;
    // Raw (untrimmed) text received so far for `currentWord`. The parser may
    // deliver a single text node in several chunks (for example around a
    // decoded entity such as `&#39;`), so chunks are accumulated, not replaced.
    let rawText = '';
    // Speaker name being assembled for the current speaker element; it is only
    // added to `json.speakers` once the element's text is complete.
    let pendingSpeaker = null;

    const round = (seconds) => parseFloat(seconds.toFixed(digits));

    // Record the finished speaker name once, preserving first-seen order.
    const commitSpeaker = () => {
      if (pendingSpeaker === null) return;
      if (!json.speakers.includes(pendingSpeaker)) json.speakers.push(pendingSpeaker);
      pendingSpeaker = null;
    };

    // Switch to a new word record and reset the per-word accumulators.
    const beginWord = (word) => {
      commitSpeaker();
      currentWord = word;
      rawText = '';
    };

    const parser = new htmlparser.Parser(
      {
        onopentag(name, attrs) {
          const word = {};

          if (name === 'p') {
            currentParagraph = {};
            if (offsets) currentParagraph.startOffset = json.transcript.length;
            json.paragraphs.push(currentParagraph);
          }

          if (attrs['data-m']) {
            word.start = parseInt(attrs['data-m'], 10) / 1e3;
            if (attrs['data-d']) word.end = word.start + parseInt(attrs['data-d'], 10) / 1e3;
          }

          // `data-t` takes precedence over `data-m` when both are present.
          if (attrs['data-t']) {
            const [start, duration] = attrs['data-t'].split(',');
            word.start = parseFloat(start);
            if (!Number.isNaN(parseFloat(duration))) word.end = word.start + parseFloat(duration);
          }

          if (attrs.class === 'speaker') word.speaker = true;

          if (Number.isFinite(word.start)) word.start = round(word.start);
          if (Number.isFinite(word.end)) word.end = round(word.end);

          if (currentParagraph) {
            // Compare against `undefined` rather than truthiness: a start time
            // of 0 seconds is a valid paragraph start.
            if (currentParagraph.start === undefined && Number.isFinite(word.start)) {
              currentParagraph.start = word.start;
            }
            if (Number.isFinite(word.end)) currentParagraph.end = word.end;
          }

          json.words.push(word);
          beginWord(word);
        },

        ontext(text) {
          const word = currentWord;
          rawText += text;
          word.text = rawText.trim();

          if (word.speaker) {
            const speaker = word.text.replace(/[:|\[|\]]/g, '');
            // A speaker element outside any <p> has no paragraph to label.
            if (currentParagraph) currentParagraph.speaker = speaker;
            pendingSpeaker = speaker;
          }

          if (offsets) {
            // Keep the offset of the first chunk; extend the end on every chunk.
            if (word.startOffset === undefined) word.startOffset = json.transcript.length;
            json.transcript += text;
            word.endOffset = json.transcript.length;
          }

          // Text found outside any open tag lands on a record that was never
          // pushed by onopentag, so add it here.
          if (json.words[json.words.length - 1] !== word) json.words.push(word);
        },

        onclosetag(name) {
          if (name === 'p') {
            if (offsets && currentParagraph) currentParagraph.endOffset = json.transcript.length;
            currentParagraph = null;
          }
          beginWord({});
        },

        onend() {
          commitSpeaker();
          // Drop records that never received any non-whitespace text.
          json.words = json.words.filter((word) => Boolean(word.text));
          resolve(json);
        },

        onerror(error) {
          reject(error);
        },
      },
      {
        decodeEntities: true,
        lowerCaseTags: true,
        lowerCaseAttributeNames: true,
        recognizeSelfClosing: true,
      },
    );

    parser.write(html);
    parser.end();
  });
}