@sanity/export
Version:
Export Sanity documents and assets
193 lines • 6.5 kB
JavaScript
import { createReadStream, existsSync, statSync } from 'node:fs';
import { basename, join } from 'node:path';
import { createInterface } from 'node:readline';
import { createGunzip } from 'node:zlib';
import tarStream from 'tar-stream';
// U+FFFD replacement character - appears when invalid UTF-8 sequences are decoded
const REPLACEMENT_CHAR = '\uFFFD';
/**
 * Scans a single line for U+FFFD replacement characters.
 *
 * @param line - Text of the line (without its line terminator)
 * @param lineNumber - Line number to record in the result
 * @returns `null` when the line contains no replacement character; otherwise
 *   an object with the line number, the 1-based column (in UTF-16 code units)
 *   of the first replacement character, a short excerpt around it, and the
 *   total number of replacement characters on the line
 */
function scanLine(line, lineNumber) {
  const index = line.indexOf(REPLACEMENT_CHAR);
  if (index === -1) {
    return null;
  }
  // Count total replacement chars on this line
  let count = 0;
  for (const char of line) {
    if (char === REPLACEMENT_CHAR) {
      count++;
    }
  }
  const isHighSurrogate = (code) => code >= 0xd800 && code <= 0xdbff;
  const isLowSurrogate = (code) => code >= 0xdc00 && code <= 0xdfff;
  // Extract context around the corruption: roughly 20 code units before the
  // first replacement character and 30 from it onwards.
  let contextStart = Math.max(0, index - 20);
  let contextEnd = Math.min(line.length, index + 30);
  // The offsets above are UTF-16 code units, so an edge can land in the middle
  // of a surrogate pair (emoji, supplementary CJK). Widen by one unit instead
  // of emitting a lone surrogate, which would itself look like corruption.
  if (
    contextStart > 0 &&
    isLowSurrogate(line.charCodeAt(contextStart)) &&
    isHighSurrogate(line.charCodeAt(contextStart - 1))
  ) {
    contextStart--;
  }
  if (
    contextEnd < line.length &&
    isHighSurrogate(line.charCodeAt(contextEnd - 1)) &&
    isLowSurrogate(line.charCodeAt(contextEnd))
  ) {
    contextEnd++;
  }
  const context = line.slice(contextStart, contextEnd);
  return {
    line: lineNumber,
    column: index + 1,
    context,
    count,
  };
}
/**
 * Reads a text stream line by line and collects one corruption record for
 * every line in which scanLine reports a U+FFFD replacement character.
 *
 * @param stream - Readable stream handed to readline as its input
 * @returns Corruption records in the order the lines were read; line numbers
 *   start at 1
 */
async function scanStream(stream) {
  const reader = createInterface({ input: stream, crlfDelay: Infinity });
  const found = [];
  let currentLine = 1;
  for await (const text of reader) {
    const hit = scanLine(text, currentLine);
    currentLine += 1;
    if (hit) {
      found.push(hit);
    }
  }
  return found;
}
/**
 * Scans a single NDJSON file on disk for UTF-8 corruption.
 *
 * The file is opened as a UTF-8 text stream and passed to scanStream.
 *
 * @param filePath - Path to the ndjson file
 * @returns Scan result: `corrupted` flag, a `files` map holding this file's
 *   corruption records (empty when the file is clean), the number of
 *   corrupted lines, and the list of scanned files (always just `filePath`)
 * @public
 */
export async function scanNdjsonFile(filePath) {
  const corruptions = await scanStream(createReadStream(filePath, { encoding: 'utf8' }));
  const corrupted = corruptions.length > 0;
  // Only files that actually contain corruption are listed in the map.
  const files = corrupted ? new Map([[filePath, corruptions]]) : new Map();
  return {
    corrupted,
    files,
    totalCorruptedLines: corruptions.length,
    scannedFiles: [filePath],
  };
}
/**
 * Scans a tar.gz archive for UTF-8 corruption in data.ndjson and assets.json files
 *
 * Entries are matched by basename, so the files are found regardless of the
 * directory prefix used inside the archive. Each matching entry is buffered in
 * memory, decoded as UTF-8 and scanned line by line; all other entries are
 * drained and skipped.
 *
 * @param filePath - Path to the tar.gz file
 * @returns Scan result with corruption information. Rejects if reading the
 *   file, decompressing it, or parsing the tar stream fails.
 * @public
 */
export async function scanTarGz(filePath) {
  const extract = tarStream.extract();
  const results = new Map();
  const scannedFiles = [];
  // `assets.json` is the name scanDirectory looks for; `asset.json` is kept so
  // archives matched by the previous version of this function still match.
  const targetFiles = new Set(['data.ndjson', 'assets.json', 'asset.json']);
  return new Promise((resolve, reject) => {
    extract.on('entry', (header, stream, next) => {
      if (!targetFiles.has(basename(header.name))) {
        // Skip this entry: it must still be drained before tar-stream moves on
        stream.on('end', next);
        stream.resume();
        return;
      }
      scannedFiles.push(header.name);
      const chunks = [];
      stream.on('data', (chunk) => {
        chunks.push(chunk);
      });
      stream.on('end', () => {
        // Decode only after concatenating, so multi-byte sequences that
        // straddle chunk boundaries are not reported as corruption.
        const content = Buffer.concat(chunks).toString('utf8');
        const corruptions = [];
        content.split(/\r?\n/).forEach((line, i) => {
          if (line.length === 0) {
            return;
          }
          const corruption = scanLine(line, i + 1);
          if (corruption) {
            corruptions.push(corruption);
          }
        });
        if (corruptions.length > 0) {
          results.set(header.name, corruptions);
        }
        next();
      });
      stream.on('error', reject);
    });
    extract.on('finish', () => {
      let totalCorruptedLines = 0;
      for (const corruptions of results.values()) {
        totalCorruptedLines += corruptions.length;
      }
      resolve({
        corrupted: results.size > 0,
        files: results,
        totalCorruptedLines,
        scannedFiles,
      });
    });
    extract.on('error', reject);
    // pipe() does not forward errors downstream, so every stage needs its own
    // handler; without one on the file stream a read failure would surface as
    // an unhandled 'error' event instead of a rejected promise.
    const source = createReadStream(filePath);
    source.on('error', reject);
    const gunzip = createGunzip();
    gunzip.on('error', reject);
    source.pipe(gunzip).pipe(extract);
  });
}
/**
 * Scans an export directory for UTF-8 corruption.
 *
 * Only `data.ndjson` and `assets.json` located directly inside the directory
 * are considered; each one that exists is scanned with scanNdjsonFile, one
 * after the other, and the individual results are merged.
 *
 * @param dirPath - Path to the directory
 * @returns Combined scan result for every file that was found
 * @throws Error when neither of the two files exists in the directory
 * @public
 */
export async function scanDirectory(dirPath) {
  const present = ['data.ndjson', 'assets.json']
    .map((name) => join(dirPath, name))
    .filter((candidate) => existsSync(candidate));
  if (present.length === 0) {
    throw new Error(`No data.ndjson or assets.json found in directory: ${dirPath}`);
  }
  const files = new Map();
  const scannedFiles = [];
  let totalCorruptedLines = 0;
  for (const target of present) {
    const { scannedFiles: scanned, files: found } = await scanNdjsonFile(target);
    scannedFiles.push(...scanned);
    found.forEach((corruptions, file) => {
      files.set(file, corruptions);
      totalCorruptedLines += corruptions.length;
    });
  }
  return {
    corrupted: files.size > 0,
    files,
    totalCorruptedLines,
    scannedFiles,
  };
}
/**
 * Entry point for corruption detection: picks the right scanner for the
 * given path.
 *
 * - a directory is handled by scanDirectory
 * - a path ending in `.tar.gz` or `.tgz` is handled by scanTarGz
 * - anything else is treated as an ndjson file and handled by scanNdjsonFile
 *
 * The corruption manifests as U+FFFD replacement characters appearing
 * where valid multi-byte characters (CJK, emoji, etc.) should be.
 *
 * @param filePath - Path to the file or directory to scan
 * @returns Scan result with corruption information
 * @public
 */
export async function detectCorruption(filePath) {
  if (statSync(filePath).isDirectory()) {
    return scanDirectory(filePath);
  }
  const looksLikeArchive = ['.tar.gz', '.tgz'].some((suffix) => filePath.endsWith(suffix));
  if (looksLikeArchive) {
    return scanTarGz(filePath);
  }
  return scanNdjsonFile(filePath);
}
//# sourceMappingURL=detectCorruption.js.map