UNPKG

@rpidanny/pdf2md

Version:

A PDF to Markdown Converter

github.com/rpidanny/pdf2md

rpidanny/pdf2md

117 lines (102 loc) • 3.51 kB

JavaScript

const MIN_DIGIT_CHAR_CODE = 48 const MAX_DIGIT_CHAR_CODE = 57 const WHITESPACE_CHAR_CODE = 32 const TAB_CHAR_CODE = 9 const DOT_CHAR_CODE = 46 exports.removeLeadingWhitespaces = function removeLeadingWhitespaces (string) { while (string.charCodeAt(0) === WHITESPACE_CHAR_CODE) { string = string.substring(1, string.length) } return string } exports.removeTrailingWhitespaces = function removeTrailingWhitespaces (string) { while (string.charCodeAt(string.length - 1) === WHITESPACE_CHAR_CODE) { string = string.substring(0, string.length - 1) } return string } exports.isDigit = function isDigit (charCode) { return charCode >= MIN_DIGIT_CHAR_CODE && charCode <= MAX_DIGIT_CHAR_CODE } exports.isNumber = function isNumber (string) { for (var i = 0; i < string.length; i++) { const charCode = string.charCodeAt(i) if (!exports.isDigit(charCode)) { return false } } return true } exports.hasOnly = function hasOnly (string, char) { const charCode = char.charCodeAt(0) for (var i = 0; i < string.length; i++) { const aCharCode = string.charCodeAt(i) if (aCharCode !== charCode) { return false } } return true } exports.hasUpperCaseCharacterInMiddleOfWord = function hasUpperCaseCharacterInMiddleOfWord (text) { var beginningOfWord = true for (var i = 0; i < text.length; i++) { const character = text.charAt(i) if (character === ' ') { beginningOfWord = true } else { if (!beginningOfWord && isNaN(character * 1) && character === character.toUpperCase() && character.toUpperCase() !== character.toLowerCase()) { return true } beginningOfWord = false } } return false } // Remove whitespace/dots + to uppercase exports.normalizedCharCodeArray = function normalizedCharCodeArray (string) { string = string.toUpperCase() return exports.charCodeArray(string).filter(charCode => charCode !== WHITESPACE_CHAR_CODE && charCode !== TAB_CHAR_CODE && charCode !== DOT_CHAR_CODE) } exports.charCodeArray = function charCodeArray (string) { const charCodes = [] for (var i = 0; i < string.length; i++) { charCodes.push(string.charCodeAt(i)) } return charCodes } exports.prefixAfterWhitespace = function prefixAfterWhitespace (prefix, string) { if (string.charCodeAt(0) === WHITESPACE_CHAR_CODE) { string = exports.removeLeadingWhitespaces(string) return ' ' + prefix + string } else { return prefix + string } } exports.suffixBeforeWhitespace = function suffixBeforeWhitespace (string, suffix) { if (string.charCodeAt(string.length - 1) === WHITESPACE_CHAR_CODE) { string = exports.removeTrailingWhitespaces(string) return string + suffix + ' ' } else { return string + suffix } } exports.isListItemCharacter = function isListItemCharacter (string) { if (string.length > 1) { return false } const char = string.charAt(0) return char === '-' || char === '•' || char === '–' } exports.isListItem = function isListItem (string) { return /^[\s]*[-•–][\s].*$/g.test(string) } exports.isNumberedListItem = function isNumberedListItem (string) { return /^[\s]*[\d]*[.][\s].*$/g.test(string) } exports.wordMatch = function wordMatch (string1, string2) { const words1 = new Set(string1.toUpperCase().split(' ')) const words2 = new Set(string2.toUpperCase().split(' ')) const intersection = new Set( [...words1].filter(x => words2.has(x))) return intersection.size / Math.max(words1.size, words2.size) }