UNPKG

@sanity/export

Version:

Export Sanity documents and assets

sanity-io/export

193 lines • 6.5 kB

JavaScript

import { createReadStream, existsSync, statSync } from 'node:fs';
import { basename, join } from 'node:path';
import { createInterface } from 'node:readline';
import { createGunzip } from 'node:zlib';
import tarStream from 'tar-stream';

// U+FFFD replacement character - appears when invalid UTF-8 sequences are decoded
const REPLACEMENT_CHAR = '\uFFFD';

// File names looked for directly inside an export directory.
const DIRECTORY_TARGET_FILES = ['data.ndjson', 'assets.json'];

// Entry basenames scanned inside a tar.gz archive. `assets.json` is the name
// scanDirectory looks for; `asset.json` is still accepted so that entries with
// that name keep being scanned.
const ARCHIVE_TARGET_FILES = ['data.ndjson', 'assets.json', 'asset.json'];

/**
 * Scans a line for U+FFFD replacement characters
 *
 * @param line - The text of a single line (without its line terminator)
 * @param lineNumber - 1-based line number to report
 * @returns `null` when the line is clean; otherwise an object holding the line
 *   number, the 1-based column of the first replacement character, a short
 *   context excerpt around it, and the number of replacement characters found
 */
function scanLine(line, lineNumber) {
  const index = line.indexOf(REPLACEMENT_CHAR);
  if (index === -1) return null;

  // Count total replacement chars on this line
  let count = 0;
  for (const char of line) {
    if (char === REPLACEMENT_CHAR) count++;
  }

  // Extract context around the corruption: up to 20 UTF-16 units before the
  // first hit and up to 30 from it onwards.
  const contextStart = Math.max(0, index - 20);
  const contextEnd = Math.min(line.length, index + 30);
  const context = line.slice(contextStart, contextEnd);

  return {
    line: lineNumber,
    column: index + 1,
    context,
    count,
  };
}

/**
 * Scans an already-decoded block of text line by line.
 *
 * Lines are split on `\n` / `\r\n`; empty lines are skipped but still count
 * towards the reported line numbers.
 *
 * @param content - Decoded text content
 * @returns Array of corruption records, one per affected line
 */
function scanText(content) {
  const corruptions = [];
  const lines = content.split(/\r?\n/);
  for (const [i, line] of lines.entries()) {
    if (line.length === 0) continue;
    const corruption = scanLine(line, i + 1);
    if (corruption) {
      corruptions.push(corruption);
    }
  }
  return corruptions;
}

/**
 * Builds the scan result object shared by all public scan functions.
 *
 * @param files - Map from file name to that file's corruption records; must
 *   only contain files that have at least one record
 * @param scannedFiles - Names of every file that was scanned
 * @returns Scan result with corruption information
 */
function buildResult(files, scannedFiles) {
  let totalCorruptedLines = 0;
  for (const corruptions of files.values()) {
    totalCorruptedLines += corruptions.length;
  }
  return {
    corrupted: files.size > 0,
    files,
    totalCorruptedLines,
    scannedFiles,
  };
}

/**
 * Scans a readable stream (expecting UTF-8 text) for corruption
 *
 * @param stream - Readable stream to consume line by line
 * @returns Array of corruption records, one per affected line
 */
async function scanStream(stream) {
  const corruptions = [];
  let lineNumber = 0;
  const rl = createInterface({
    input: stream,
    crlfDelay: Infinity,
  });
  for await (const line of rl) {
    lineNumber++;
    const corruption = scanLine(line, lineNumber);
    if (corruption) {
      corruptions.push(corruption);
    }
  }
  return corruptions;
}

/**
 * Scans an NDJSON file for UTF-8 corruption
 *
 * @param filePath - Path to the ndjson file
 * @returns Scan result with corruption information
 * @public
 */
export async function scanNdjsonFile(filePath) {
  const stream = createReadStream(filePath, { encoding: 'utf8' });
  const corruptions = await scanStream(stream);
  const files = new Map();
  if (corruptions.length > 0) {
    files.set(filePath, corruptions);
  }
  return buildResult(files, [filePath]);
}

/**
 * Scans a tar.gz archive for UTF-8 corruption in data.ndjson and assets.json files
 *
 * Each matching entry is buffered fully in memory and decoded in one go before
 * being scanned. Entries are matched by basename, so their directory inside the
 * archive does not matter. The returned promise rejects if reading the file,
 * gunzipping it, or extracting the archive emits an error.
 *
 * @param filePath - Path to the tar.gz file
 * @returns Scan result with corruption information
 * @public
 */
export async function scanTarGz(filePath) {
  const results = new Map();
  const scannedFiles = [];

  return new Promise((resolve, reject) => {
    const source = createReadStream(filePath);
    const gunzip = createGunzip();
    const extract = tarStream.extract();

    // pipe() does not forward errors between stages, so each stage needs its
    // own handler. On failure, tear the whole chain down so the file handle is
    // released, then reject (repeat calls are harmless).
    const fail = (err) => {
      source.destroy();
      gunzip.destroy();
      extract.destroy();
      reject(err);
    };

    extract.on('entry', (header, stream, next) => {
      stream.on('error', fail);

      if (!ARCHIVE_TARGET_FILES.includes(basename(header.name))) {
        // Skip this entry - it still has to be drained before the next one
        stream.on('end', next);
        stream.resume();
        return;
      }

      scannedFiles.push(header.name);
      const chunks = [];
      stream.on('data', (chunk) => {
        chunks.push(chunk);
      });
      stream.on('end', () => {
        // Combine all chunks before decoding so multi-byte characters that
        // straddle chunk boundaries are not mistaken for corruption
        const content = Buffer.concat(chunks).toString('utf8');
        const corruptions = scanText(content);
        if (corruptions.length > 0) {
          results.set(header.name, corruptions);
        }
        next();
      });
    });

    extract.on('finish', () => {
      resolve(buildResult(results, scannedFiles));
    });

    source.on('error', fail);
    gunzip.on('error', fail);
    extract.on('error', fail);

    source.pipe(gunzip).pipe(extract);
  });
}

/**
 * Scans a directory for UTF-8 corruption in data.ndjson and assets.json files
 *
 * @param dirPath - Path to the directory
 * @returns Scan result with corruption information
 * @throws Error when neither data.ndjson nor assets.json exists in the directory
 * @public
 */
export async function scanDirectory(dirPath) {
  const foundFiles = DIRECTORY_TARGET_FILES
    .map((filename) => join(dirPath, filename))
    .filter((filePath) => existsSync(filePath));

  if (foundFiles.length === 0) {
    throw new Error(`No data.ndjson or assets.json found in directory: ${dirPath}`);
  }

  const results = new Map();
  const scannedFiles = [];
  // Files are scanned one after another, in DIRECTORY_TARGET_FILES order
  for (const filePath of foundFiles) {
    const result = await scanNdjsonFile(filePath);
    scannedFiles.push(...result.scannedFiles);
    for (const [file, corruptions] of result.files) {
      results.set(file, corruptions);
    }
  }
  return buildResult(results, scannedFiles);
}

/**
 * Detects UTF-8 corruption in an export file (ndjson, tar.gz, or directory)
 *
 * The corruption manifests as U+FFFD replacement characters appearing
 * where valid multi-byte characters (CJK, emoji, etc.) should be.
 *
 * Directories are handed to scanDirectory, paths ending in `.tar.gz` or `.tgz`
 * to scanTarGz, and everything else is treated as an NDJSON file.
 *
 * @param filePath - Path to the file or directory to scan
 * @returns Scan result with corruption information
 * @public
 */
export async function detectCorruption(filePath) {
  const stat = statSync(filePath);
  if (stat.isDirectory()) {
    return scanDirectory(filePath);
  }
  const isGzip = filePath.endsWith('.tar.gz') || filePath.endsWith('.tgz');
  return isGzip ? scanTarGz(filePath) : scanNdjsonFile(filePath);
}
//# sourceMappingURL=detectCorruption.js.map