UNPKG

usfm-grammar

Version:

Uses the tree-sitter-usfm3 parser to convert USFM files to other formats such as USJ, USX, and CSV, and converts them back to USFM

210 lines (181 loc) 6.74 kB
/**
 * Markers whose inner content is thrown away together with the marker when the
 * marker itself is filtered out. For any other filtered-out marker the
 * children are kept and spliced into the parent's content.
 */
const MARKERS_WITH_DISCARDABLE_CONTENTS = [
  'ide', 'usfm', 'h', 'toc', 'toca',
  'imt', 'is', 'ip', 'ipi', 'im', 'imi', 'ipq', 'imq', 'ipr', 'iq', 'ib',
  'ili', 'iot', 'io', 'iex', 'imte', 'ie',
  'mt', 'mte', 'cl', 'cd', 'ms', 'mr', 's', 'sr', 'r', 'd', 'sp', 'sd',
  'sts', 'rem', 'lit', 'restore',
  'f', 'fe', 'ef', 'efe', 'x', 'ex',
  'fr', 'ft', 'fk', 'fq', 'fqa', 'fl', 'fw', 'fp', 'fv', 'fdc',
  'xo', 'xop', 'xt', 'xta', 'xk', 'xq', 'xot', 'xnt', 'xdc',
  'jmp', 'fig', 'cat', 'esb', 'b',
];

// Trailing digits of a marker (e.g. the "1" in "s1"); stripped before lookups.
const trailingNumPattern = /\d+$/;
// A text piece starting with one of these is joined without a space before it.
const punctPatternNoSpaceBefore = /^[,.\-—/;:!?@$%^)}\]>”»]/;
// Accumulated text ending with one of these is joined without a space after it.
const punctPatternNoSpaceAfter = /[\-—/`@^&({[<“«]$/;

/**
 * Merges every run of adjacent strings in a content list into one string.
 *
 * A single space is inserted between two pieces unless the text so far is
 * empty or already ends with a space, the next piece starts with a space,
 * the next piece starts with punctuation matched by
 * `punctPatternNoSpaceBefore`, or the text so far ends with punctuation
 * matched by `punctPatternNoSpaceAfter`.
 *
 * @param {Array} contentsList - Mixed list of strings and non-string items.
 * @returns {Array} New list with consecutive strings merged; non-string items
 *   are kept as-is and in their original order.
 */
function combineConsecutiveTextContents(contentsList) {
  const textCombinedContents = [];
  let textContents = '';

  contentsList.forEach((item) => {
    if (typeof item === 'string') {
      const joinWithoutSpace = textContents.endsWith(' ')
        || item.startsWith(' ')
        || textContents === ''
        || punctPatternNoSpaceBefore.test(item)
        || punctPatternNoSpaceAfter.test(textContents);
      if (!joinWithoutSpace) {
        textContents += ' ';
      }
      textContents += item;
      return;
    }

    // A non-string item ends the current run of text.
    if (textContents !== '') {
      textCombinedContents.push(textContents);
      textContents = '';
    }
    textCombinedContents.push(item);
  });

  if (textContents !== '') {
    textCombinedContents.push(textContents);
  }
  return textCombinedContents;
}

/**
 * Returns the marker name used for filter lookups: the node's `marker` with
 * trailing digits removed, `'ref'` for marker-less nodes of type `ref`, and
 * `''` for anything else.
 */
function baseMarkerOf(node) {
  if ('marker' in node) {
    return node.marker.replace(trailingNumPattern, '');
  }
  if (node.type === 'ref') {
    return 'ref';
  }
  return '';
}

/**
 * Runs `cleanChild` over every child, splicing array results into the output
 * and appending anything else, then optionally merges adjacent strings.
 */
function cleanChildren(children, cleanChild, combineTexts) {
  const cleanedKids = [];
  children.forEach((child) => {
    const cleaned = cleanChild(child);
    if (Array.isArray(cleaned)) {
      cleanedKids.push(...cleaned);
    } else {
      cleanedKids.push(cleaned);
    }
  });
  return combineTexts ? combineConsecutiveTextContents(cleanedKids) : cleanedKids;
}

/**
 * Normalizes a marker selection into a flat list with trailing digits removed.
 * Accepts a flat array, arbitrarily nested arrays (for example
 * `[Filter.BCV, Filter.TEXT]`) or a single marker string.
 */
function normalizeMarkerList(markers) {
  return [markers]
    .flat(Infinity)
    .map((marker) => marker.replace(trailingNumPattern, ''));
}

/**
 * Recursively removes the listed markers from a USJ node.
 *
 * NOTE: the node is modified in place. A kept node always ends up with a
 * `content` array, even when it had none before (it is then empty).
 *
 * @param {object|string} inputUsj - Node or text item to clean.
 * @param {string[]} excludeMarkers - Marker names (without trailing digits) to
 *   drop. Including `'text-in-excluded-parent'` also drops text whose
 *   immediate parent marker was dropped.
 * @param {boolean} [combineTexts=true] - Merge adjacent strings afterwards.
 * @param {boolean} [excludedParent=false] - Whether the immediate parent was
 *   dropped; only consulted for string items.
 * @returns {object|Array} The same node when it is kept; otherwise the list of
 *   its surviving children, or `[]` when its content is discardable. A string
 *   input yields `[string]` or `[]`.
 */
function excludeMarkersInUsj(
  inputUsj, excludeMarkers, combineTexts = true, excludedParent = false) {
  if (typeof inputUsj === 'string') {
    const dropText = excludedParent && excludeMarkers.includes('text-in-excluded-parent');
    return dropText ? [] : [inputUsj];
  }

  const thisMarker = baseMarkerOf(inputUsj);
  const thisMarkerNeeded = !excludeMarkers.includes(thisMarker);
  // Children of a dropped marker survive unless the marker's content is discardable.
  const innerContentNeeded = thisMarkerNeeded
    || !MARKERS_WITH_DISCARDABLE_CONTENTS.includes(thisMarker);

  let cleanedKids = [];
  if (innerContentNeeded && 'content' in inputUsj) {
    // Only the immediate parent's status is passed down, not any ancestor's.
    const parentDropped = !thisMarkerNeeded;
    cleanedKids = cleanChildren(
      inputUsj.content,
      (item) => excludeMarkersInUsj(item, excludeMarkers, combineTexts, parentDropped),
      combineTexts,
    );
  }

  if (thisMarkerNeeded) {
    inputUsj.content = cleanedKids;
    return inputUsj;
  }
  return innerContentNeeded ? cleanedKids : [];
}

/**
 * Recursively keeps only the listed markers in a USJ node.
 *
 * Nodes without a resolvable marker (for example the document root) are always
 * kept. On `c` / `v` nodes the `altnumber` and `pubnumber` attributes are
 * deleted unless `ca` / `cp` (for `c`) or `va` / `vp` (for `v`) are listed.
 *
 * NOTE: the node is modified in place. A kept node always ends up with a
 * `content` array, even when it had none before (it is then empty).
 *
 * @param {object|string} inputUsj - Node or text item to clean.
 * @param {string[]} includeMarkers - Marker names (without trailing digits) to
 *   keep. Text whose immediate parent marker was dropped is kept only when
 *   `'text-in-excluded-parent'` is listed.
 * @param {boolean} [combineTexts=true] - Merge adjacent strings afterwards.
 * @param {boolean} [excludedParent=false] - Whether the immediate parent was
 *   dropped; only consulted for string items.
 * @returns {object|Array} The same node when it is kept; otherwise the list of
 *   its surviving children, or `[]` when its content is discardable. A string
 *   input yields `[string]` or `[]`.
 */
function includeMarkersInUsj(
  inputUsj, includeMarkers, combineTexts = true, excludedParent = false) {
  if (typeof inputUsj === 'string') {
    const dropText = excludedParent && !includeMarkers.includes('text-in-excluded-parent');
    return dropText ? [] : [inputUsj];
  }

  const thisMarker = baseMarkerOf(inputUsj);
  const thisMarkerNeeded = thisMarker === '' || includeMarkers.includes(thisMarker);
  const innerContentNeeded = thisMarkerNeeded
    || !MARKERS_WITH_DISCARDABLE_CONTENTS.includes(thisMarker);

  let cleanedKids = [];
  if (innerContentNeeded && 'content' in inputUsj) {
    const parentDropped = !thisMarkerNeeded;
    cleanedKids = cleanChildren(
      inputUsj.content,
      (item) => includeMarkersInUsj(item, includeMarkers, combineTexts, parentDropped),
      combineTexts,
    );
  }

  // Alternate / published numbers live as attributes on the chapter or verse
  // node, so they are filtered here rather than as child markers.
  if (thisMarker === 'c' || thisMarker === 'v') {
    if (!includeMarkers.includes(`${thisMarker}a`)) {
      delete inputUsj.altnumber;
    }
    if (!includeMarkers.includes(`${thisMarker}p`)) {
      delete inputUsj.pubnumber;
    }
  }

  if (thisMarkerNeeded) {
    inputUsj.content = cleanedKids;
    return inputUsj;
  }
  return innerContentNeeded ? cleanedKids : [];
}

/**
 * Predefined marker groups plus the two entry points for filtering a USJ tree.
 */
class Filter {
  // Defines the values of filter options
  static BOOK_HEADERS = [
    'ide', 'usfm', 'h', 'toc', 'toca', // identification
    'imt', 'is', 'ip', 'ipi', 'im', 'imi', 'ipq', 'imq', 'ipr', 'iq', 'ib',
    'ili', 'iot', 'io', 'iex', 'imte', 'ie', // intro
  ];

  static TITLES = [
    'mt', 'mte', 'cl', 'cd', 'ms', 'mr', 's', 'sr', 'r', 'd', 'sp', 'sd', // headings
  ];

  static COMMENTS = ['sts', 'rem', 'lit', 'restore']; // comment markers

  static PARAGRAPHS = [
    'p', 'm', 'po', 'pr', 'cls', 'pmo', 'pm', 'pmc', // paragraphs-quotes-lists-tables
    'pmr', 'pi', 'mi', 'nb', 'pc', 'ph', 'q', 'qr', 'qc', 'qa', 'qm', 'qd',
    'lh', 'li', 'lf', 'lim', 'litl',
    'tr', 'tc', 'th', 'tcr', 'thr', 'table', 'b',
  ];

  static CHARACTERS = [
    'add', 'bk', 'dc', 'ior', 'iqt', 'k', 'litl', 'nd', 'ord', 'pn',
    'png', 'qac', 'qs', 'qt', 'rq', 'sig', 'sls', 'tl', 'wj', // Special-text
    'em', 'bd', 'bdit', 'it', 'no', 'sc', 'sup', // character styling
    'rb', 'pro', 'w', 'wh', 'wa', 'wg', // special-features
    'lik', 'liv', // structured list entries
    'jmp',
  ];

  static NOTES = [
    'f', 'fe', 'ef', 'efe', 'x', 'ex', // footnotes-and-crossrefs
    'fr', 'ft', 'fk', 'fq', 'fqa', 'fl', 'fw', 'fp', 'fv', 'fdc',
    'xo', 'xop', 'xt', 'xta', 'xk', 'xq', 'xot', 'xnt', 'xdc',
  ];

  static STUDY_BIBLE = ['esb', 'cat']; // sidebars-extended-contents

  static BCV = ['id', 'c', 'v'];

  static TEXT = ['text-in-excluded-parent', 'text'];

  /**
   * Keeps only the given markers in a USJ tree (modifies the tree in place).
   *
   * @param {object} inputUsj - USJ node to filter.
   * @param {string|Array} includeMarkers - Markers to keep: a flat array,
   *   nested arrays of marker groups, or a single marker string. Trailing
   *   digits are ignored.
   * @param {boolean} [combineTexts=true] - Merge adjacent strings afterwards.
   * @returns {object|Array} Result of `includeMarkersInUsj` for `inputUsj`.
   */
  static keepOnly(inputUsj, includeMarkers, combineTexts = true) {
    const cleanedMarkers = normalizeMarkerList(includeMarkers);
    return includeMarkersInUsj(inputUsj, cleanedMarkers, combineTexts);
  }

  /**
   * Removes the given markers from a USJ tree (modifies the tree in place).
   *
   * @param {object} inputUsj - USJ node to filter.
   * @param {string|Array} excludeMarkers - Markers to drop: a flat array,
   *   nested arrays of marker groups, or a single marker string. Trailing
   *   digits are ignored.
   * @param {boolean} [combineTexts=true] - Merge adjacent strings afterwards.
   * @returns {object|Array} Result of `excludeMarkersInUsj` for `inputUsj`.
   */
  static remove(inputUsj, excludeMarkers, combineTexts = true) {
    const cleanedMarkers = normalizeMarkerList(excludeMarkers);
    return excludeMarkersInUsj(inputUsj, cleanedMarkers, combineTexts);
  }
}

// CommonJS exports. The guard changes nothing under CommonJS (where `exports`
// always exists); it only keeps the file from throwing when it is evaluated
// somewhere that binding is absent.
if (typeof exports !== 'undefined') {
  exports.excludeMarkersInUsj = excludeMarkersInUsj;
  exports.includeMarkersInUsj = includeMarkersInUsj;
  exports.Filter = Filter;
}