UNPKG

mcard-js

Version:

A JavaScript implementation of MCard - A data model for persistently storing content with cryptographic hashing and timestamping

437 lines (370 loc) 13.7 kB
/**
 * Utility functions for handling Buffer content stored in Node.js Buffer JSON format.
 * Specifically handles the {"type":"Buffer","data":[...]} format.
 */

// Content longer than this many characters is classified from a short sample only.
const MAX_DETAILED_ANALYSIS_LENGTH = 1000000;
const LARGE_CONTENT_SAMPLE_LENGTH = 1000;

// The text-vs-binary byte heuristic only inspects this many leading bytes.
const SIGNATURE_TEXT_SAMPLE_BYTES = 100;
const MIN_TEXT_RATIO = 0.9;

// Characters that do not belong in text: C0/C1 control characters (except tab,
// line feed and carriage return), DEL, and U+FFFD, which TextDecoder emits for
// byte sequences that are not valid UTF-8. Non-ASCII letters are NOT listed, so
// text in any language still counts as text.
const NON_TEXT_CHAR = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F\uFFFD]/;
const NON_TEXT_CHARS = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F\uFFFD]/g;

// Unambiguous markers of an HTML document (as opposed to generic XML).
const HTML_DOCUMENT_MARKER = /<!doctype\s+html|<html[\s>]/i;

const MIME_TYPES = Object.freeze({
  'json': 'application/json',
  'js': 'application/javascript',
  'txt': 'text/plain',
  'html': 'text/html',
  'htm': 'text/html',
  'css': 'text/css',
  'svg': 'image/svg+xml',
  'png': 'image/png',
  'jpg': 'image/jpeg',
  'jpeg': 'image/jpeg',
  'gif': 'image/gif',
  'pdf': 'application/pdf',
  'csv': 'text/csv',
  'tsv': 'text/tab-separated-values',
  'xml': 'application/xml',
  'md': 'text/markdown',
  'yaml': 'application/yaml',
  'yml': 'application/yaml',
  'sql': 'application/sql',
  'bin': 'application/octet-stream',
  'zip': 'application/zip',
  'bmp': 'image/bmp',
  'webp': 'image/webp',
});

/**
 * Tells whether a value has the Node.js Buffer JSON shape {"type":"Buffer","data":[...]}.
 *
 * @param {any} value - The value to test
 * @returns {boolean} True when the value is a Buffer JSON object
 */
const isBufferJson = (value) =>
  typeof value === 'object' &&
  value !== null &&
  value.type === 'Buffer' &&
  Array.isArray(value.data);

/**
 * Extracts the bytes of a Buffer JSON object or of a string holding one.
 *
 * @param {any} content - Buffer JSON object, serialized Buffer JSON string, or anything else
 * @returns {Uint8Array|null} The bytes, or null when content is not Buffer JSON
 */
const bufferJsonToBytes = (content) => {
  if (isBufferJson(content)) {
    return new Uint8Array(content.data);
  }

  // Cheap substring test first so ordinary strings are never run through JSON.parse.
  if (
    typeof content === 'string' &&
    content.includes('"type":"Buffer"') &&
    content.includes('"data":[')
  ) {
    try {
      const parsed = JSON.parse(content);
      if (isBufferJson(parsed)) {
        return new Uint8Array(parsed.data);
      }
    } catch (e) {
      // Not valid JSON: the string only mentions the markers, so it is a regular string.
    }
  }

  return null;
};

/**
 * Converts a Buffer JSON object or string to a regular string.
 * Works with all these formats:
 * - {"type":"Buffer","data":[...]} (object)
 * - '{"type":"Buffer","data":[...]}' (string)
 * - Regular string (returned unchanged)
 * - Regular objects (pretty-printed as JSON)
 * - Other primitives (converted with String())
 *
 * @param {any} content - The content to process
 * @returns {string|null} Decoded string, or null for falsy input or when conversion fails
 */
export const convertBufferToString = (content) => {
  if (!content) return null;

  try {
    const bytes = bufferJsonToBytes(content);
    if (bytes) {
      return new TextDecoder().decode(bytes);
    }
    if (typeof content === 'string') {
      return content;
    }
    if (typeof content === 'object') {
      return JSON.stringify(content, null, 2);
    }
    return String(content);
  } catch (e) {
    console.error("Error converting buffer to string:", e);
    return typeof content === 'string' ? content : null;
  }
};

/**
 * Get raw binary data from a Buffer-like object.
 * Buffer JSON (object or serialized string) yields its data bytes; any other
 * string yields its UTF-8 encoding, including strings that merely mention the
 * Buffer JSON markers without being valid Buffer JSON.
 *
 * @param {any} content - The content to extract binary data from
 * @returns {Uint8Array|null} Binary data as Uint8Array, or null for falsy or non-string, non-Buffer input
 */
export const extractBinaryContent = (content) => {
  if (!content) return null;

  try {
    const bytes = bufferJsonToBytes(content);
    if (bytes) {
      return bytes;
    }
    if (typeof content === 'string') {
      return new TextEncoder().encode(content);
    }
    return null;
  } catch (e) {
    console.error("Error extracting binary content:", e);
    return null;
  }
};

/**
 * Tells whether data contains the given byte sequence at the given offset.
 *
 * @param {Uint8Array} data - Bytes to inspect
 * @param {number[]} signature - Expected byte values
 * @param {number} [offset=0] - Index in data where the signature should start
 * @returns {boolean} True when every signature byte matches
 */
const startsWithBytes = (data, signature, offset = 0) =>
  data.length >= offset + signature.length &&
  signature.every((byte, i) => data[offset + i] === byte);

/**
 * Heuristically decides whether the leading bytes of data are text.
 * Data counts as text when more than 90% of the sampled bytes are printable
 * ASCII/whitespace, or when the sample is well-formed UTF-8 whose decoded
 * characters are more than 90% non-control characters.
 *
 * @param {Uint8Array} data - Bytes to inspect
 * @returns {boolean} True when the sample looks like text
 */
const looksLikeText = (data) => {
  const sample = data.subarray(0, SIGNATURE_TEXT_SAMPLE_BYTES);
  if (sample.length === 0) return false;

  let printableAscii = 0;
  for (const byte of sample) {
    if ((byte >= 32 && byte <= 126) || byte === 9 || byte === 10 || byte === 13) {
      printableAscii++;
    }
  }
  if (printableAscii / sample.length > MIN_TEXT_RATIO) {
    return true;
  }

  try {
    // fatal: throw on malformed UTF-8 instead of substituting U+FFFD.
    // stream: when the sample is a prefix of longer data, tolerate a multi-byte
    // character cut in half at the sample boundary.
    const decoded = new TextDecoder('utf-8', { fatal: true }).decode(sample, {
      stream: data.length > sample.length,
    });
    if (decoded.length === 0) return false;
    const textChars = decoded.replace(NON_TEXT_CHARS, '').length;
    return textChars / decoded.length > MIN_TEXT_RATIO;
  } catch (e) {
    // Malformed UTF-8 and not predominantly ASCII: treat as binary.
    return false;
  }
};

/**
 * Detects file signatures (magic numbers) from binary data.
 *
 * @param {Uint8Array} data - Binary data to analyze
 * @returns {string|null} File type ('png', 'jpg', 'gif', 'pdf', 'zip', 'webp', 'bmp'),
 *   'bin' for unrecognized binary data, or null when the data is shorter than
 *   4 bytes or looks like text (so text analysis can take over)
 */
const detectFileSignature = (data) => {
  if (!data || data.length < 4) return null;

  // PNG: 89 50 4E 47 0D 0A 1A 0A
  if (startsWithBytes(data, [0x89, 0x50, 0x4E, 0x47])) return 'png';

  // JPEG: FF D8 FF
  if (startsWithBytes(data, [0xFF, 0xD8, 0xFF])) return 'jpg';

  // GIF87a: 47 49 46 38 37 61
  // GIF89a: 47 49 46 38 39 61
  if (
    startsWithBytes(data, [0x47, 0x49, 0x46, 0x38]) &&
    (data[4] === 0x37 || data[4] === 0x39) &&
    data[5] === 0x61
  ) {
    return 'gif';
  }

  // PDF: 25 50 44 46
  if (startsWithBytes(data, [0x25, 0x50, 0x44, 0x46])) return 'pdf';

  // ZIP: 50 4B 03 04
  if (startsWithBytes(data, [0x50, 0x4B, 0x03, 0x04])) return 'zip';

  // WEBP: 52 49 46 46 (RIFF), 4 size bytes, then 57 45 42 50 (WEBP)
  if (
    startsWithBytes(data, [0x52, 0x49, 0x46, 0x46]) &&
    startsWithBytes(data, [0x57, 0x45, 0x42, 0x50], 8)
  ) {
    return 'webp';
  }

  const textLike = looksLikeText(data);

  // BMP: 42 4D ("BM"). A two-byte magic is too weak on its own: plain text that
  // happens to begin with "BM" must not be reported as an image.
  if (!textLike && startsWithBytes(data, [0x42, 0x4D])) return 'bmp';

  // Text is left to the text content analysis; anything else is generic binary.
  return textLike ? null : 'bin';
};

/**
 * Performs basic content type detection for very large files.
 * Only looks at the sample it is given (the first portion of the content).
 * A JSON sample that was truncated will not parse and is then classified by
 * the remaining checks.
 *
 * @param {string} sample - Leading portion of the content
 * @returns {string} Detected content type; 'bin' for empty or mostly non-text samples
 */
const detectBasicContentType = (sample) => {
  if (!sample) return 'bin';

  const trimmed = sample.trim();

  if (trimmed.startsWith('{') || trimmed.startsWith('[')) {
    try {
      JSON.parse(sample);
      return 'json';
    } catch (e) {
      // Not valid JSON
    }
  }

  // An XML declaration wins; otherwise HTML markers are checked before the
  // generic "starts with a tag" XML test, which every HTML document also passes.
  if (trimmed.startsWith('<?xml')) return 'xml';
  if (HTML_DOCUMENT_MARKER.test(sample)) return 'html';
  if (trimmed.startsWith('<') && sample.includes('</') && sample.includes('>')) return 'xml';
  if (sample.includes('<html') || sample.includes('<!DOCTYPE html')) return 'html';

  // Check if it's mostly text
  const nonTextCount = sample.length - sample.replace(NON_TEXT_CHARS, '').length;
  if (nonTextCount / sample.length < 0.1) {
    if (sample.includes('# ') || sample.includes('## ') || sample.includes('```')) {
      return 'md';
    }
    if (sample.includes(',')) return 'csv';
    if (sample.includes('\t')) return 'tsv';
    return 'txt';
  }

  return 'bin';
};

/**
 * Classifies text by inspecting its structure.
 * Checks run in this order: JSON, XML/HTML, CSV/TSV, SQL, Markdown, CSS,
 * JavaScript, YAML, plain text. The first match wins.
 *
 * @param {string} text - Non-empty text to classify
 * @param {boolean} fromBufferObject - True when the text was decoded from a
 *   Buffer JSON object; such content is reported as CSV/TSV as soon as its
 *   first line contains the delimiter
 * @returns {string} Detected content type, 'bin' when the text contains control characters
 */
const classifyText = (text, fromBufferObject) => {
  const trimmed = text.trim();

  if (trimmed.startsWith('{') || trimmed.startsWith('[')) {
    try {
      JSON.parse(text);
      return 'json';
    } catch (e) {
      // Not JSON, continue with other checks
    }
  }

  // An XML declaration wins; otherwise HTML markers are checked before the
  // generic "starts with a tag" XML test, which every HTML document also passes.
  const hasClosingTag = text.includes('</') && text.includes('>');
  if (trimmed.startsWith('<?xml') && hasClosingTag) return 'xml';
  if (HTML_DOCUMENT_MARKER.test(text)) return 'html';
  if (trimmed.startsWith('<') && hasClosingTag) return 'xml';
  if (
    text.includes('<html') ||
    text.includes('<body') ||
    text.includes('<head') ||
    text.includes('<!DOCTYPE html')
  ) {
    return 'html';
  }

  const lines = text.split('\n');

  if (fromBufferObject && lines.length > 1) {
    if (lines[0].includes(',')) return 'csv';
    if (lines[0].includes('\t')) return 'tsv';
  }

  // Delimited data: more than half of the non-empty lines contain the delimiter.
  const nonEmptyLines = lines.filter((line) => line.trim());
  if (nonEmptyLines.length > 1) {
    const linesWithCommas = nonEmptyLines.filter((line) => line.includes(','));
    if (linesWithCommas.length / nonEmptyLines.length > 0.5) return 'csv';

    const linesWithTabs = nonEmptyLines.filter((line) => line.includes('\t'));
    if (linesWithTabs.length / nonEmptyLines.length > 0.5) return 'tsv';
  }

  // Check for SQL
  if (
    (text.includes('SELECT ') ||
      text.includes('INSERT INTO ') ||
      text.includes('CREATE TABLE ') ||
      text.includes('UPDATE ')) &&
    text.includes(';')
  ) {
    return 'sql';
  }

  // Check for Markdown
  if (
    (text.includes('# ') || text.includes('## ') || text.includes('```') || text.includes('**')) &&
    !text.includes('<html') &&
    !text.includes('<body')
  ) {
    return 'md';
  }

  // Check for CSS
  if (
    text.includes('{') &&
    text.includes('}') &&
    text.includes(':') &&
    text.includes(';') &&
    /[.#]?[a-zA-Z][a-zA-Z0-9_-]*\s*\{/.test(text)
  ) {
    return 'css';
  }

  // Check for JavaScript
  if (
    (text.includes('function') ||
      text.includes('=>') ||
      text.includes('var ') ||
      text.includes('let ') ||
      text.includes('const ') ||
      text.includes('import ')) &&
    /[;{}()]/.test(text)
  ) {
    return 'js';
  }

  // Check for YAML (both patterns are anchored at the very start of the content)
  if (
    (/^\s*[a-zA-Z_][a-zA-Z0-9_]*\s*:/.test(text) ||
      /^\s*-\s+[a-zA-Z_][a-zA-Z0-9_]*\s*:/.test(text)) &&
    !text.includes('{') &&
    !text.includes('}')
  ) {
    return 'yaml';
  }

  // Plain text: at least one non-whitespace character and no control characters.
  const hasVisibleChars = /[^\n\r\t ]/.test(text);
  if (hasVisibleChars && !NON_TEXT_CHAR.test(text)) {
    return 'txt';
  }

  // Last resort: binary
  return 'bin';
};

/**
 * Detects the content type based on actual content analysis.
 * File signatures (magic numbers) are checked first so that binary data is
 * never handed to the text heuristics; text is then classified by structure.
 *
 * @param {any} content - The content to analyze (Buffer JSON object or string, plain string, or other value)
 * @returns {string} Detected content type extension; 'bin' when the content is
 *   empty, not convertible to text, or not recognizably text
 */
export const detectContentType = (content) => {
  const binaryData = extractBinaryContent(content);
  if (binaryData) {
    const fileSignature = detectFileSignature(binaryData);
    if (fileSignature) {
      return fileSignature;
    }
  }

  const textContent = convertBufferToString(content);
  if (!textContent) return 'bin';

  // Size check to avoid processing very large content
  if (textContent.length > MAX_DETAILED_ANALYSIS_LENGTH) {
    return detectBasicContentType(textContent.slice(0, LARGE_CONTENT_SAMPLE_LENGTH));
  }

  try {
    return classifyText(textContent, isBufferJson(content));
  } catch (e) {
    console.error("Error during content type detection:", e);
    return 'bin';
  }
};

/**
 * Gets the MIME type from a simplified content type extension.
 *
 * @param {string} simpleType - Simple content type like 'json', 'csv', etc.
 * @returns {string} The corresponding MIME type; 'application/octet-stream' for
 *   empty input or 'data', and `application/<simpleType>` for unknown types
 */
export const getMimeType = (simpleType) => {
  // Avoid generic "data" type
  if (!simpleType || simpleType === 'data') {
    return 'application/octet-stream';
  }

  // Own-property check so inherited keys such as 'constructor' or '__proto__'
  // are treated as unknown types instead of returning a non-string value.
  return Object.prototype.hasOwnProperty.call(MIME_TYPES, simpleType)
    ? MIME_TYPES[simpleType]
    : `application/${simpleType}`;
};