UNPKG

frazy-parser

Version:

Parser for subtitles in different formats into frazy.me format. Also converter between subtitles types. (srt --> vtt etc)

github.com/aparus/frazy-parser

aparus/frazy-parser

500 lines (442 loc) • 14.7 kB

JavaScript

import matchAll from 'string.prototype.matchall'; /** * works for SRT, VTT, ASS * @param {string} timecode like `00:00:00.500` or `00:00.500` (hours is optional) * @returns {number} seconds like 1234.567 * @example * parseTime('00:00:00.500') // 0.5 */ const parseTimecode = timecode => { if (!timecode) return null; const number = Number(timecode); if (number >= 0) return number; if (typeof timecode === 'string') { const timeArray = timecode.replace(',', '.').split(':').reverse(); //reverse because 'hours' is optional and good if it at the end of array const [seconds, minutes, hours = '0'] = timeArray; const timeNumber = +seconds + +minutes * 60 + +hours * 60 * 60; return timeNumber; } }; /** * * @param {string} inputSeconds * @example * formatSecondsToTime(225) // "3:45" */ const formatSecondsToTime = inputSeconds => { let totalSeconds = +inputSeconds.toFixed(0); const hours = Math.floor(totalSeconds / 3600); const hoursString = hours ? hours + ':' : ''; totalSeconds %= 3600; const minutes = Math.floor(totalSeconds / 60); const seconds = totalSeconds % 60; const secondsString = seconds.toString().padStart(2, '0'); return `${hoursString}${minutes}:${secondsString}`; }; const findCurrentPhraseNum = (phrases, time) => { const findIndex = (array, time) => { return array.findIndex((elem, index, array) => { const { end: thisEnd } = elem; const { end: nextEnd } = array[index + 1] || Infinity; return time >= thisEnd && time <= nextEnd; }) + 1; }; const findedIndex = findIndex(phrases, time); return findedIndex ? findedIndex : phrases.length - 1; }; const extractVoiceTags = cueText => { const voiceRegex = new RegExp(/<v(\S+?)?\s+?(.+?)>([\s\S]+?)(<\/v>|$)/g); const matches = [...matchAll(cueText, voiceRegex)]; if (!matches.length) return [{ text: cueText }];else { const result = matches.reduce((prevItem, currentItem, index, array) => { const [matchString, classesString = '', name, text = ''] = currentItem; const { index: indexStartCurrent, input } = currentItem; const indexEndCurrent = indexStartCurrent + matchString.length; const classes = classesString ? classesString.trim().slice(1).split('.') : []; const { index: indexStartNext } = array[index + 1] || input.length; const curObj = [{ voice: { name, classes }, text: text.trim() }]; if (indexEndCurrent < indexStartNext) { const text = input.slice(indexEndCurrent, indexStartNext).trim(); // between voices, without voice tag, if (text) curObj.push({ text }); } return [...prevItem, ...curObj]; }, []); const textBeforeVoiceTags = () => { const { index: firstVoiceStartPosition } = matches[0]; const text = cueText.slice(0, firstVoiceStartPosition).trim(); return firstVoiceStartPosition > 0 && text ? [{ text }] : []; }; const textAfterVoiceTags = () => { const lastVoiceMatch = matches[matches.length - 1]; const { index: lastVoiceStartPosition } = lastVoiceMatch; const [matchString = ''] = lastVoiceMatch; const lastVoiceEndPosition = lastVoiceStartPosition + matchString.length; const text = cueText.slice(lastVoiceEndPosition, cueText.length).trim(); return lastVoiceEndPosition < cueText.length && text ? [{ text }] : []; }; return [...textBeforeVoiceTags(), ...result, ...textAfterVoiceTags()]; } }; const parseYamlParams = text => { const paramTemplate = /^(.+?):\s*(.+?)$/; const matches = [...text.matchAll(new RegExp(paramTemplate, 'gm'))]; if (!matches) return null; // it is not params text const paramsObject = {}; const linesArray = text ? text.split('\n') : []; let nextIntend = 0; let parentKey = ''; linesArray.forEach((line, index, linesArray) => { const getIntendOfLine = index => { return linesArray?.[index]?.match(/^\s+/)?.[0]?.length || 0; }; const intend = getIntendOfLine(index); // current line nextIntend = getIntendOfLine(index + 1) || 0; const paramMatch = line.match(paramTemplate); if (!paramMatch && parentKey) { // we have text line, and should put it to prev parent key const prevText = paramsObject[parentKey] || ''; const isNotLastLine = intend === nextIntend; const lastSymbol = isNotLastLine ? '\n' : ''; paramsObject[parentKey] = prevText + line?.trim() + lastSymbol; } let [, paramKey, paramValue] = paramMatch || []; paramKey = paramKey?.trim(); paramValue = paramValue?.trim(); if (paramKey && paramValue && !parentKey) { // whole line is a 1-st level param paramsObject[paramKey] = paramValue; } if (paramKey && paramValue && parentKey) { // array of params for parent key const prevArray = paramsObject[parentKey] || []; const newValue = { [paramKey]: paramValue }; prevArray.push(newValue); paramsObject[parentKey] = prevArray; } if (paramKey && !paramValue) { // we have a key, but haven't value => it is parent key for next lines parentKey = paramKey; } if (nextIntend < intend) { // we had parentKey on prev step, but now should clear it because of intend parentKey = ''; } }); return paramsObject; }; /** * UNIVERSAL PARSER FOR ALL KIND OF SUBTITLE FORMATS */ /* cueTemplates play 2 roles: 1) check subs type 2) extract data from it: identifier, start, end, body */ const cueTemplates = { srt: /^(\d+\s+)(\d\d:\d\d:\d\d,\d\d\d)\s+-->\s+(\d\d:\d\d:\d\d,\d\d\d)\s+([\s\S]+?)[\n\r]{2}/gm, vtt: /^(.+[\n\r])?(\d?\d?:?\d\d:\d\d\.\d\d\d)\s+-->\s+(\d?\d?:?\d\d:\d\d\.\d\d\d).*?[\n\r]([\s\S]+?)[\n\r]{2}/gm, ass: /^(Dialogue: 0,)?(\d?\d:\d\d:\d\d\.\d\d),(\d?\d:\d\d:\d\d\.\d\d)(,Default,,0,0,0,,)? ?(.+?)$/gm, audacity: /^(\d+?(\.\d+?)?)\s+?(\d+?\.?(\.\d+?)?)\s+?(.+)$/gm, // a little tricky, because floating part is optional unknown: /^(.+)$/gm }; const positionInCueTemplate = { srt: { identifier: 1, start: 2, end: 3, body: 4 }, vtt: { identifier: 1, start: 2, end: 3, body: 4 }, audacity: { start: 1, end: 3, body: 5 }, ass: { start: 2, end: 3, body: 5 }, unknown: { body: 0 } }; const checkSubsType = text => { for (const subsType in cueTemplates) { const cueTemplate = cueTemplates[subsType]; const match = text.match(cueTemplate) || []; if (match.length > 0) { return subsType; } } }; // for examples, please read /tests/subtitles/ const parseSubs = (text, extractVoices = true) => { const subsType = checkSubsType(text); const indexes = positionInCueTemplate[subsType]; const arrayOfMatches = [...matchAll(text + '\n\n', cueTemplates[subsType])]; const subsObject = arrayOfMatches.map((elem, index) => { // difference between id/identifier: identifier in vtt can be any word, it used for styling in css // id is order number const id = index + 1; const identifier = indexes.identifier && elem[indexes.identifier] ? elem[indexes.identifier].trim() : ''; const start = parseTimecode(elem[indexes.start]); const end = parseTimecode(elem[indexes.end]); const body = extractVoices ? extractVoiceTags(elem[indexes.body]) : elem[indexes.body]; const currentSub = { id, identifier, start, end, body }; if (!identifier) delete currentSub.identifier; if (!start && start !== 0) delete currentSub.start; if (!end && end !== 0) delete currentSub.end; return currentSub; }); return subsObject; }; // *** VTT ADVANCED PARSER (with INFO, CHAPTERS, COMMENTS, etc) *** // const vttInfoTemplate = /^webvtt *-? *(.+)*[\n\r]([\s\S]+?)[\n\r]{2}/gim; // first block of document const vttChapterTemplate = /^note chapter\s*[\n\r]([\s\S]+?)[\n\r]{2}/gim; const vttCommentTemplate = /^note comment\s*[\n\r]([\s\S]+?)[\n\r]{2}/gim; const parseVttCueMatchElem = (elem, id, extractVoices = true) => { const indexes = { identifier: 1, start: 2, end: 3, body: 4 }; const identifier = indexes.identifier && elem[indexes.identifier] ? elem[indexes.identifier].trim() : ''; const start = parseTimecode(elem[indexes.start]); const end = parseTimecode(elem[indexes.end]); const body = extractVoices ? extractVoiceTags(elem[indexes.body]) : elem[indexes.body]; const currentSub = { id, identifier, start, end, body }; if (!identifier) delete currentSub.identifier; if (!start && start !== 0) delete currentSub.start; if (!end && end !== 0) delete currentSub.end; return currentSub; }; /** * * @param {*} matchElem - is array after text.match * @returns object */ const parseCommentMatchElem = matchElem => { const [, text] = matchElem; return { text }; }; const parseInfoMatchElem = matchElem => { const [, topTitle, text] = matchElem; return { topTitle, ...parseYamlParams(text) }; }; /** * * @param {*} matchElem - is array after text.match * @returns object */ const parseChapterMatchElem = matchElem => { const [, chapterText] = matchElem; const parsedChapter = parseYamlParams(chapterText); let { start, end } = parsedChapter; start = parseTimecode(start); end = parseTimecode(end); if (start) parsedChapter.start = start; if (end) parsedChapter.end = end; return { ...parsedChapter }; }; // deprecated, use parseVtt.filter type === 'chapter' // vtt supports comments, that is lines begin from NOTE // we use them to put chapters info inside subs file // for examples look at tests/subtitles/vttParseChapters.test.js const parseChapters = text => { const chaptersMatch = [...text.matchAll(vttChapterTemplate)]; const chapters = chaptersMatch.map(elem => parseChapterMatchElem(elem)); return chapters; }; /** * Advanced parser, to extract not only phrases, but also info, chapters, and comments */ const parseVtt = text => { const textArray = text.trim().split(/[\n\r]{2}/); // all regex templates are 'gmi' // global => then we use matchAll // multiline => then we add '\n\n' let cueId = 0; return textArray.map(elem => { let [matchElem] = [...matchAll(elem + '\n\n', vttInfoTemplate)]; if (matchElem) { return { type: 'info', ...parseInfoMatchElem(matchElem) }; } [matchElem] = [...matchAll(elem + '\n\n', vttChapterTemplate)]; if (matchElem) { return { type: 'chapter', ...parseChapterMatchElem(matchElem) }; } [matchElem] = [...matchAll(elem + '\n\n', cueTemplates.vtt)]; if (matchElem) { cueId++; return { type: 'cue', ...parseVttCueMatchElem(matchElem, cueId) }; } [matchElem] = [...matchAll(elem + '\n\n', vttCommentTemplate)]; if (matchElem) return { type: 'comment', ...parseCommentMatchElem(matchElem) }; //else return { type: 'unknown', text: elem }; }); }; const getRegexIndexes = (text, regex, label) => { return [...matchAll(text, regex)].map(elem => { const [outerText] = elem; const { index: startIndex } = elem; const endIndex = startIndex + outerText.length; return { label, indexes: [startIndex, endIndex] }; }); }; /** * categorize every peace of text by regex patterns * stick a label (its type) to each part of text * @param {string} textInput * @param {object[]} patterns // [ { label, regex, parser, replacers } ] * @param {string} pattern.label - name for particular block of text * @param {string} pattern.regex - regular expression that describes block of text * @param {function} pattern.parser - function (text){ returns object or array } * @param {function[]} pattern.replacers - array of functions(text){returns text after replacement} * @param {string} defaultLabel // for blocks without labels (not match regex) * @returns {object[]} [ { label, indexes, text, data } ] */ const parseText = (textInput, patterns, defaultLabel = 'uncategorized' // display = [ 'indexes', 'text', 'data' ] ) => { let array = patterns.reduce((prev, item) => { const { regex, label } = item; const indexes = getRegexIndexes(textInput, regex, label); if (label && label !== defaultLabel) { return [...prev, ...indexes]; } else { return [...prev]; } }, []); // categorized indexes const categorizedIndexes = array.sort((a, b) => a['indexes'][0] - b['indexes'][0]); const uncategorizedIndexes = // if there is no categorizedIndexes, all text is uncategorized categorizedIndexes.length === 0 ? [{ label: defaultLabel, indexes: [0, textInput.length] }] : //looking for gaps between categorizedIndexes and collect them as 'uncategorized' categorizedIndexes.reduce((prev, currentItem, index, array) => { const [, currentItemEnd] = currentItem.indexes; const [nextItemStart] = array[index + 1] ? array[index + 1].indexes : [textInput.length]; if (currentItemEnd + 1 !== nextItemStart && index < array.length) { return [...prev, { label: defaultLabel, indexes: [currentItemEnd, nextItemStart] }]; } else { return [...prev]; } }, []); // find zero element, witch can be lost while reduce array = [...categorizedIndexes, ...uncategorizedIndexes].sort((a, b) => a['indexes'][0] - b['indexes'][0]); const [firstIndex] = array[0].indexes; if (firstIndex > 0) { const zeroElement = { label: defaultLabel, indexes: [0, firstIndex] }; array.unshift(zeroElement); } const [, lastIndex] = array[array.length - 1].indexes; if (lastIndex < textInput.length) { const lastElement = { label: defaultLabel, indexes: [lastIndex, textInput.length] }; array.push(lastElement); } // ===== return ===== return array.map(elem => { const { label } = elem; const [startIndex, endIndex] = elem ? elem.indexes : []; const { parser, replacers = [] } = patterns.find(elem => elem.label === label) || {}; let text = textInput.slice(startIndex, endIndex); replacers.forEach(replacer => { text = replacer ? replacer(text) : text; }); const data = parser ? parser(text) : null; return { ...elem, text, data }; }); }; // regexes export { checkSubsType, findCurrentPhraseNum, formatSecondsToTime, parseChapters, parseSubs, parseText, parseTimecode, parseVtt, parseYamlParams };