UNPKG

@hyperaudio/transcript-parser

Version:
122 lines (95 loc) 3.59 kB
'use strict'; exports.__esModule = true; var _htmlparser = require('htmlparser2'); var _htmlparser2 = _interopRequireDefault(_htmlparser); function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { default: obj }; } exports.default = function (html) { var offsets = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false; var digits = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 2; return new Promise(function (resolve, reject) { var json = { words: [], paragraphs: [], speakers: [] }; if (offsets) json.transcript = ''; var currentWord = {}; var currentParagraph = void 0; var parser = new _htmlparser2.default.Parser({ onopentag: function onopentag(name, attrs) { var word = {}; if (name === 'p') { currentParagraph = {}; if (offsets) currentParagraph.startOffset = json.transcript.length; json.paragraphs.push(currentParagraph); // if (attrs[`data-tc`]) { // const [hh, mm, ss] = attrs[`data-tc`].split(`:`); // currentParagraph.start = parseInt(hh) * 3600 + parseInt(mm) * 60 + parseInt(ss); // } } if (attrs['data-m']) { word.start = parseInt(attrs['data-m']) / 1e3; if (attrs['data-d']) word.end = word.start + parseInt(attrs['data-d']) / 1e3; } if (attrs['data-t']) { var _attrs$$split = attrs['data-t'].split(','), start = _attrs$$split[0], duration = _attrs$$split[1]; word.start = parseFloat(start); if (!isNaN(parseFloat(duration))) word.end = word.start + parseFloat(duration); } if (attrs['class'] && attrs['class'] === 'speaker') { word.speaker = true; } if (word.start) word.start = parseFloat(word.start.toFixed(digits)); if (word.end) word.end = parseFloat(word.end.toFixed(digits)); if (currentParagraph) { if (!currentParagraph.start && word.start) { currentParagraph.start = word.start; } if (word.end) currentParagraph.end = word.end; } json.words.push(word); currentWord = word; }, ontext: function ontext(text) { var word = currentWord; if (word.speaker) { currentParagraph.speaker = text.trim().replace(/[:|\[|\]]/g, ''); if (!json.speakers.includes(currentParagraph.speaker)) json.speakers.push(currentParagraph.speaker); } word.text = text.trim(); if (offsets) { word.startOffset = json.transcript.length; json.transcript += text; word.endOffset = json.transcript.length; } if (json.words.length === 0 || word !== json.words[json.words.length - 1]) json.words.push(word); }, onclosetag: function onclosetag(name) { if (name === 'p') { if (offsets) currentParagraph.endOffset = json.transcript.length; currentParagraph = null; } currentWord = {}; }, onend: function onend() { json.words = json.words.filter(function (word) { return !!word.text; }); resolve(json); }, onerror: function onerror(error) { return reject(error); } }, { decodeEntities: true, lowerCaseTags: true, lowerCaseAttributeNames: true, recognizeSelfClosing: true }); parser.write(html); parser.end(); }); }; module.exports = exports['default'];