UNPKG

himarc

Version:

another marc21 analyzer in text format

572 lines (527 loc) 19.1 kB
// MARCMaker Specifications : https://www.loc.gov/marc/makrbrkr.html#what-is-marc const allJsonSchema = require('json-schema-himarc'); module.exports = { mrcToObject, mrkToObject, tokenizer, syntaxAnalyzer, toHimarc, toHTML }; /** * The mrcToObject function takes the marc21 format, tokenize it, parse it and transform it into a javascript object * @param {string} data marc21 format * @param {array} filterTag filter data only for input tag * @returns {Object} */ function mrcToObject (data, filterTag = []) { const tokens = data.split(String.fromCharCode(0x1E)); const leader = getLeaderFrom(tokens); const directory = getDirectoryFrom(tokens); const directoryEntries = getDirectoryEntriesFrom(directory); const variableFields = getVariableFieldsFrom(tokens); const fields = directoryEntries .map((entry, index) => { const variableField = variableFields[index]; return { entry, variableField }; }) .filter(({ entry, variableField }) => filterTag.length === 0 || filterTag.includes(entry.tag)) .map(({ entry, variableField }) => { const field = {}; if (entry.tag.startsWith('00')) { field[entry.tag] = variableField; } else { const fieldTag = field[entry.tag] = {}; const dataFieldTokens = variableField.split(String.fromCharCode(0x1F)); fieldTag.indicator1 = getIndicator1From(dataFieldTokens); fieldTag.indicator2 = getIndicator2From(dataFieldTokens); fieldTag.subFields = getSubFieldFrom(dataFieldTokens); } return field; }) .reduce((previous, current) => { const tag = Object.keys(current)[0]; if (isFieldRepeatable(tag)) { if (!(tag in previous)) previous[tag] = []; previous[tag].push(current[tag]); return previous; } else { return Object.assign(previous, current); } }, {}); if (filterTag.length === 0 || filterTag.includes('LDR')) fields.LDR = leader; if ((filterTag.length === 0 || filterTag.includes('007')) && '007' in fields) { fields['007'] = fields['007'].map(value => formatField007(value)); } if (filterTag.length === 0 || filterTag.includes('008')) { const fields008 = fields['008'] || ''; fields['008'] = formatField008(fields008, leader.positions['06'], leader.positions['07']); } return fields; } /** * The mrkToObject function takes the raw marc21 text, tokenize it, parse it and transform it into a javascript object * @param {string} data raw marc21 text * @returns {Object} */ function mrkToObject (data) { const result = toHimarc(syntaxAnalyzer(tokenizer(data))); return result; } /** * The tokenizer function takes the raw marc21 text and splits it apart into tokens. Tokens are an array of tiny little * objects that describe an isolated piece of the syntax. * @param {string} input raw marc21 text * @returns {Array} */ function tokenizer (input) { const WHITESPACE = /\s/; const EOL = /\n/; const START_FIELD = /=/; const SUBFIELD_CODE_DELIMITER = /\$/; const tokens = []; let position = 0; while (position < input.length) { let char = input[position]; let beforeChar = input[position - 1]; if (START_FIELD.test(char)) { tokens.push({ type: 'startField', value: char, startPosition: position }); char = input[++position]; let value = ''; const startPosition = position; while (!(WHITESPACE.test(char) || EOL.test(char) || position >= input.length)) { value += char; char = input[++position]; } tokens.push({ type: 'data', value: value.trim(), startPosition }); continue; } if (EOL.test(char)) { tokens.push({ type: 'eol', value: char, startPosition: position }); char = input[++position]; continue; } if (WHITESPACE.test(char)) { tokens.push({ type: 'whitespace', value: char, startPosition: position }); char = input[++position]; continue; } if (SUBFIELD_CODE_DELIMITER.test(char) && beforeChar !== '/') { tokens.push({ type: 'subFieldCodeDelimiter', value: char, startPosition: position }); char = input[++position]; continue; } let value = ''; const startPosition = position; while ( !EOL.test(char) && !(SUBFIELD_CODE_DELIMITER.test(char) && beforeChar !== '/') && !(position > (input.length - 1)) ) { value += char; char = input[++position]; beforeChar = input[position - 1]; } tokens.push({ type: 'data', value: value, startPosition }); } return tokens; } /** * The syntaxAnalyzer function takes the tokens and reformats them into a representation that describes each part of the * syntax and their relation to one another. * @param {Array} tokens tokens from the tokenizer function * @returns {Array} */ function syntaxAnalyzer (tokens) { const errors = []; const data = tokens.map((token, index) => { if (token.type !== 'data') return token; const previousToken = getPreviousToken(tokens, index) || { type: null }; if (previousToken.type === 'startField') { token.type = 'tag'; const TAG = /^([0-9]{3}|LDR)$/; if (!TAG.test(token.value)) { token.message = 'tag field is invalid'; errors.push(token); } return token; } if (previousToken.type === 'tag' && (previousToken.value.startsWith('00') || previousToken.value === 'LDR')) { token.type = 'controlFieldInfo'; return token; } if (previousToken.type === 'tag') { token.type = 'indicators'; const INDICATOR = /^[0-9\\]{2}$/; if (!INDICATOR.test(token.value)) { token.message = 'Indicators must have two characters in every variable data field'; errors.push(token); } return token; } if (previousToken.type === 'subFieldCodeDelimiter') { const subFieldCode = { type: 'subFieldCode', value: token.value[0], startPosition: token.startPosition }; const ALPHANUMERIC = /[a-z0-9]/; if (!ALPHANUMERIC.test(subFieldCode.value)) subFieldCode.message = 'subField code must be a lowercase alphabetic or numeric character'; const subFieldInfo = { type: 'subFieldInfo', startPosition: token.startPosition + 1 }; if (token.value.length > 1) { subFieldInfo.value = token.value.slice(1); } else { subFieldInfo.value = ''; subFieldInfo.message = 'data element is empty'; } token.type = 'dataFieldInfo'; token.value = [subFieldCode, subFieldInfo]; if ('error' in subFieldCode || 'error' in subFieldInfo) { errors.push(token); } return token; } token.type = 'unknown'; errors.push(token); return token; }); return { data, errors }; } /** * The toHimarc transform function takes the the tokens after the syntax analysis step and builds a representation of the Marc21 * data into a javascript object * @param {Object} result Object from the syntaxAnalyzer function * @returns {Object} */ function toHimarc (result) { const errors = result.errors || []; const cache = []; const fields = result.data.map((token, index) => (token.type === 'startField') ? index : null) .filter(indice => indice !== null) .reduce((accumulator, currentValue, index, arr) => { if (index + 2 <= arr.length) accumulator.push(arr.slice(index, index + 2)); if (index === (arr.length - 1)) accumulator.push([currentValue, result.data.length]); return accumulator; }, []) .map(fieldIndice => { return result.data.slice(...fieldIndice); }) .map(fieldsInfo => { return fieldsInfo.reduce((accumulator, current) => { if (current.type === 'tag') accumulator[current.type] = current.value; if (current.type === 'controlFieldInfo') accumulator.value = current.value; if (current.type === 'indicators') { accumulator.indicator1 = current.value.charAt(0); accumulator.indicator2 = current.value.charAt(1); } if (current.type === 'dataFieldInfo') { const subField = current.value.reduce((accumulator, current) => { accumulator[current.type] = current.value; return accumulator; }, {}); if (!('subFields' in accumulator)) accumulator.subFields = []; accumulator.subFields.push({ [subField.subFieldCode]: subField.subFieldInfo.trim() }); } return accumulator; }, {}); }) .map(field => { if (!('subFields' in field) && !field.tag.startsWith('00')) field.subFields = []; if (field.tag === 'LDR') field.value = formatLeader(field.value); if (field.tag === '007') field.value = formatField007(field.value); return field; }) .map((field, index, arr) => { if (field.tag === '008') { const leader = arr.filter(item => item.tag === 'LDR'); if (leader.length > 0) { const leaderValue = leader[0].value; field.value = formatField008(field.value, leaderValue.positions['06'], leaderValue.positions['07']); } else { errors.push({ type: 'field', message: 'the leader is missing for the transformation of field 008' }); } } return field; }) .reduce((accumulator, current) => { const value = ('value' in current) ? current.value : Object.keys(current).reduce((acc, key) => { if (key !== 'tag') acc[key] = current[key]; return acc; }, {}); if (isFieldRepeatable(current.tag)) { if (current.tag in accumulator) { accumulator[current.tag].push(value); } else { accumulator[current.tag] = [value]; } } else { if (cache.includes(current.tag)) { errors.push({ type: 'field', value: current.tag, message: "field is repeated when it shouldn't" }); } else { cache.push(current.tag); } accumulator[current.tag] = value; } return accumulator; }, {}); return { fields, errors }; } /** * The toHTML transform function takes the the tokens after the syntax analysis step and builds a representation of the Marc21 * data into HTML * @param {Object} result Object from the syntaxAnalyzer function * @returns {Object} */ function toHTML (parsedContent) { return parsedContent.reduce((accumulator, current) => { if (['whitespace', 'eol'].includes(current.type)) { accumulator += current.value; return accumulator; } if (Array.isArray(current.value)) { const value = current.value.map(item => { if (item.type === 'subFieldCode') { return `<span class="${item.type}">${item.value}</span>`; } else { return item.value; } }).join(''); accumulator += value; return accumulator; } accumulator += `<span class="${current.type}">${current.value}</span>`; return accumulator; }, ''); } function formatLeader (leader) { return { positions: { '00-04': leader.slice(0, 5), '05': leader.charAt(5), '06': leader.charAt(6), '07': leader.charAt(7), '08': leader.charAt(8), '09': leader.charAt(9), 10: leader.charAt(10), 11: leader.charAt(11), '12-16': leader.slice(12, 17), 17: leader.charAt(17), 18: leader.charAt(18), 19: leader.charAt(19), 20: leader.charAt(20), 21: leader.charAt(21), 22: leader.charAt(22), 23: leader.charAt(23) } }; } function formatField007 (value) { let fieldInfos; const categoryOfMaterial = value.split('')[0]; const initFieldInfos = (data, size) => { return data.padStart(size).split('').map((value, index) => ({ position: String(index).padStart(2, '0'), value })); }; const isMaps = (categoryOfMaterial) => categoryOfMaterial === 'a'; if (isMaps(categoryOfMaterial)) { fieldInfos = initFieldInfos(value, 8); } const isElectronicResource = (categoryOfMaterial) => categoryOfMaterial === 'c'; if (isElectronicResource(categoryOfMaterial)) { fieldInfos = initFieldInfos(value, 14); spliceAndSetData(fieldInfos, 6, 9, value); } const isGlobe = (categoryOfMaterial) => categoryOfMaterial === 'd'; if (isGlobe(categoryOfMaterial)) { fieldInfos = initFieldInfos(value, 6); } const isTactileMaterial = (categoryOfMaterial) => categoryOfMaterial === 'f'; if (isTactileMaterial(categoryOfMaterial)) { fieldInfos = initFieldInfos(value, 10); spliceAndSetData(fieldInfos, 6, 9, value); spliceAndSetData(fieldInfos, 3, 5, value); } const isProjectedGraphic = (categoryOfMaterial) => categoryOfMaterial === 'g'; if (isProjectedGraphic(categoryOfMaterial)) { fieldInfos = initFieldInfos(value, 9); } const isMicroForm = (categoryOfMaterial) => categoryOfMaterial === 'h'; if (isMicroForm(categoryOfMaterial)) { fieldInfos = initFieldInfos(value, 13); spliceAndSetData(fieldInfos, 6, 9, value); } const isNonprojectedGraphic = (categoryOfMaterial) => categoryOfMaterial === 'k'; if (isNonprojectedGraphic(categoryOfMaterial)) { fieldInfos = initFieldInfos(value, 6); } const isMotionPicture = (categoryOfMaterial) => categoryOfMaterial === 'm'; if (isMotionPicture(categoryOfMaterial)) { fieldInfos = initFieldInfos(value, 23); spliceAndSetData(fieldInfos, 17, 23, value); } const isKit = (categoryOfMaterial) => categoryOfMaterial === 'o'; const isNotatedMusic = (categoryOfMaterial) => categoryOfMaterial === 'q'; const isText = (categoryOfMaterial) => categoryOfMaterial === 't'; const isUnspecified = (categoryOfMaterial) => categoryOfMaterial === 'z'; if (isKit(categoryOfMaterial) || isNotatedMusic(categoryOfMaterial) || isText(categoryOfMaterial) || isUnspecified(categoryOfMaterial)) { fieldInfos = initFieldInfos(value, 2); } const isRemoteSensingImage = (categoryOfMaterial) => categoryOfMaterial === 'r'; if (isRemoteSensingImage(categoryOfMaterial)) { fieldInfos = initFieldInfos(value, 11); spliceAndSetData(fieldInfos, 9, 11, value); } const isSoundRecording = (categoryOfMaterial) => categoryOfMaterial === 's'; if (isSoundRecording(categoryOfMaterial)) { fieldInfos = initFieldInfos(value, 14); } const isVideoRecording = (categoryOfMaterial) => categoryOfMaterial === 'v'; if (isVideoRecording(categoryOfMaterial)) { fieldInfos = initFieldInfos(value, 9); } return fieldInfos.reduce((accumulator, current) => { accumulator.positions[current.position] = current.value; return accumulator; }, { positions: {} }); } function formatField008 (value, typeOfRecord, bibliographicLevel) { const fieldInfos = value.padStart(40).split('').map((value, index) => ({ position: String(index).padStart(2, '0'), value })); spliceAndSetData(fieldInfos, 35, 38, value); const isBooks = (typeOfRecord === 'a' && ['a', 'c', 'd', 'm'].includes(bibliographicLevel)) || typeOfRecord === 't'; if (isBooks) { spliceAndSetData(fieldInfos, 24, 28, value); spliceAndSetData(fieldInfos, 18, 22, value); } const isComputerFields = typeOfRecord === 'm'; if (isComputerFields) { spliceAndSetData(fieldInfos, 29, 35, value); spliceAndSetData(fieldInfos, 24, 26, value); spliceAndSetData(fieldInfos, 18, 22, value); } const isMaps = ['e', 'f'].includes(typeOfRecord); if (isMaps) { spliceAndSetData(fieldInfos, 33, 35, value); spliceAndSetData(fieldInfos, 26, 28, value); spliceAndSetData(fieldInfos, 22, 24, value); spliceAndSetData(fieldInfos, 18, 22, value); } const isMusic = ['c', 'd', 'i', 'j'].includes(typeOfRecord); if (isMusic) { spliceAndSetData(fieldInfos, 30, 32, value); spliceAndSetData(fieldInfos, 24, 30, value); spliceAndSetData(fieldInfos, 18, 20, value); } const isContinuingResources = typeOfRecord === 'a' && ['b', 'i', 's'].includes(bibliographicLevel); if (isContinuingResources) { spliceAndSetData(fieldInfos, 30, 33, value); } const isVisualMaterials = ['g', 'k', 'o', 'r'].includes(typeOfRecord); if (isVisualMaterials) { spliceAndSetData(fieldInfos, 30, 33, value); spliceAndSetData(fieldInfos, 23, 27, value); spliceAndSetData(fieldInfos, 18, 20, value); } const isMixedMaterials = typeOfRecord === 'p'; if (isMixedMaterials) { spliceAndSetData(fieldInfos, 24, 35, value); spliceAndSetData(fieldInfos, 18, 23, value); } spliceAndSetData(fieldInfos, 15, 18, value); spliceAndSetData(fieldInfos, 11, 15, value); spliceAndSetData(fieldInfos, 7, 11, value); spliceAndSetData(fieldInfos, 0, 6, value); return fieldInfos.reduce((accumulator, current) => { accumulator.positions[current.position] = current.value; return accumulator; }, { positions: {} }); } function getPreviousToken (arr, index) { if (index >= 1) { if (['whitespace', 'eol'].includes(arr[index - 1].type)) { return getPreviousToken(arr, index - 1); } else { return arr[index - 1]; } } else { return null; } } function spliceAndSetData (arr, start, end, data) { arr.splice(start, (end - start), { position: `${String(start).padStart(2, '0')}-${String(end - 1).padStart(2, '0')}`, value: data.substring(start, end) }); } function isFieldRepeatable (tag) { const repeatableTag = Object.keys(allJsonSchema.register.fields.properties).filter(field => { return allJsonSchema.register.fields[field].isRepeatable; }); return repeatableTag.includes(tag); } function getLeaderFrom (tokens) { const leader = tokens[0].slice(0, 24); return { positions: { '00-04': leader.slice(0, 5), '05': leader.charAt(5), '06': leader.charAt(6), '07': leader.charAt(7), '08': leader.charAt(8), '09': leader.charAt(9), 10: leader.charAt(10), 11: leader.charAt(11), '12-16': leader.slice(12, 17), 17: leader.charAt(17), 18: leader.charAt(18), 19: leader.charAt(19), 20: leader.charAt(20), 21: leader.charAt(21), 22: leader.charAt(22), 23: leader.charAt(23) } }; } function getDirectoryFrom (tokens) { return tokens[0].slice(24); } function getDirectoryEntriesFrom (directory) { return directory.match(/(.{1,12})/g) .map(entry => { return { tag: entry.slice(0, 3), lengthOfField: entry.slice(3, 7), startingCharacterPosition: entry.slice(7, 12) }; }); } function getVariableFieldsFrom (tokens) { return tokens.slice(1); } function getIndicator1From (dataFieldTokens) { return dataFieldTokens[0][0] === ' ' ? '\\' : dataFieldTokens[0][0]; } function getIndicator2From (dataFieldTokens) { return dataFieldTokens[0][1] === ' ' ? '\\' : dataFieldTokens[0][1]; } function getSubFieldFrom (dataFieldTokens) { return dataFieldTokens.slice(1) .map(subfield => { return { [subfield[0]]: subfield.slice(1).trim() }; }); }