@sanity/export
Version:
Export Sanity documents and assets
259 lines (220 loc) • 6.93 kB
text/typescript
import {createReadStream, existsSync, statSync} from 'node:fs'
import {basename, join} from 'node:path'
import {createInterface} from 'node:readline'
import type {Readable} from 'node:stream'
import {createGunzip} from 'node:zlib'
import tarStream from 'tar-stream'
// U+FFFD replacement character - appears when invalid UTF-8 sequences are decoded
const REPLACEMENT_CHAR = '\uFFFD'
/**
* Information about corruption found on a specific line
* @public
*/
export interface CorruptionInfo {
/** Line number (1-indexed) */
line: number
/** Column position of first replacement char */
column: number
/** Surrounding text for context */
context: string
/** Number of replacement chars on this line */
count: number
}
/**
* Result of scanning a file for corruption
* @public
*/
export interface ScanResult {
/** Whether corruption was detected */
corrupted: boolean
/** Map of filename to corruption info (for tar.gz, multiple files may be scanned) */
files: Map<string, CorruptionInfo[]>
/** Total number of corrupted lines across all files */
totalCorruptedLines: number
/** List of files that were scanned */
scannedFiles: string[]
}
/**
* Scans a line for U+FFFD replacement characters
*/
function scanLine(line: string, lineNumber: number): CorruptionInfo | null {
const index = line.indexOf(REPLACEMENT_CHAR)
if (index === -1) return null
// Count total replacement chars on this line
let count = 0
for (const char of line) {
if (char === REPLACEMENT_CHAR) count++
}
// Extract context around the corruption
const contextStart = Math.max(0, index - 20)
const contextEnd = Math.min(line.length, index + 30)
const context = line.slice(contextStart, contextEnd)
return {
line: lineNumber,
column: index + 1,
context,
count,
}
}
/**
* Scans a readable stream (expecting UTF-8 text) for corruption
*/
async function scanStream(stream: Readable): Promise<CorruptionInfo[]> {
const corruptions: CorruptionInfo[] = []
let lineNumber = 0
const rl = createInterface({
input: stream,
crlfDelay: Infinity,
})
for await (const line of rl) {
lineNumber++
const corruption = scanLine(line, lineNumber)
if (corruption) {
corruptions.push(corruption)
}
}
return corruptions
}
/**
* Scans an NDJSON file for UTF-8 corruption
*
* @param filePath - Path to the ndjson file
* @returns Scan result with corruption information
* @public
*/
export async function scanNdjsonFile(filePath: string): Promise<ScanResult> {
const stream = createReadStream(filePath, {encoding: 'utf8'})
const corruptions = await scanStream(stream)
const files = new Map<string, CorruptionInfo[]>()
if (corruptions.length > 0) {
files.set(filePath, corruptions)
}
return {
corrupted: corruptions.length > 0,
files,
totalCorruptedLines: corruptions.length,
scannedFiles: [filePath],
}
}
/**
* Scans a tar.gz archive for UTF-8 corruption in data.ndjson and asset.json files
*
* @param filePath - Path to the tar.gz file
* @returns Scan result with corruption information
* @public
*/
export async function scanTarGz(filePath: string): Promise<ScanResult> {
const extract = tarStream.extract()
const results = new Map<string, CorruptionInfo[]>()
const scannedFiles: string[] = []
const targetFiles = ['data.ndjson', 'asset.json']
return new Promise((resolve, reject) => {
extract.on('entry', (header, stream, next) => {
const fileBasename = basename(header.name)
if (targetFiles.includes(fileBasename)) {
scannedFiles.push(header.name)
const chunks: Buffer[] = []
stream.on('data', (chunk: Buffer) => {
chunks.push(chunk)
})
stream.on('end', () => {
// Combine all chunks and convert to string
const content = Buffer.concat(chunks).toString('utf8')
const lines = content.split(/\r?\n/)
const corruptions: CorruptionInfo[] = []
for (let i = 0; i < lines.length; i++) {
const line = lines[i]
if (line !== undefined && line.length > 0) {
const corruption = scanLine(line, i + 1)
if (corruption) {
corruptions.push(corruption)
}
}
}
if (corruptions.length > 0) {
results.set(header.name, corruptions)
}
next()
})
stream.on('error', reject)
} else {
// Skip this entry
stream.on('end', next)
stream.resume()
}
})
extract.on('finish', () => {
let totalCorruptedLines = 0
for (const corruptions of results.values()) {
totalCorruptedLines += corruptions.length
}
resolve({
corrupted: results.size > 0,
files: results,
totalCorruptedLines,
scannedFiles,
})
})
extract.on('error', reject)
const gunzip = createGunzip()
gunzip.on('error', reject)
createReadStream(filePath).pipe(gunzip).pipe(extract)
})
}
/**
* Scans a directory for UTF-8 corruption in data.ndjson and assets.json files
*
* @param dirPath - Path to the directory
* @returns Scan result with corruption information
* @public
*/
export async function scanDirectory(dirPath: string): Promise<ScanResult> {
const targetFiles = ['data.ndjson', 'assets.json']
const foundFiles: string[] = []
for (const filename of targetFiles) {
const filePath = join(dirPath, filename)
if (existsSync(filePath)) {
foundFiles.push(filePath)
}
}
if (foundFiles.length === 0) {
throw new Error(
`No data.ndjson or assets.json found in directory: ${dirPath}`,
)
}
const results = new Map<string, CorruptionInfo[]>()
const scannedFiles: string[] = []
let totalCorruptedLines = 0
for (const filePath of foundFiles) {
const result = await scanNdjsonFile(filePath)
scannedFiles.push(...result.scannedFiles)
for (const [file, corruptions] of result.files) {
results.set(file, corruptions)
totalCorruptedLines += corruptions.length
}
}
return {
corrupted: results.size > 0,
files: results,
totalCorruptedLines,
scannedFiles,
}
}
/**
* Detects UTF-8 corruption in an export file (ndjson, tar.gz, or directory)
*
* The corruption manifests as U+FFFD replacement characters appearing
* where valid multi-byte characters (CJK, emoji, etc.) should be.
*
* @param filePath - Path to the file or directory to scan
* @returns Scan result with corruption information
* @public
*/
export async function detectCorruption(filePath: string): Promise<ScanResult> {
const stat = statSync(filePath)
if (stat.isDirectory()) {
return scanDirectory(filePath)
}
const isGzip = filePath.endsWith('.tar.gz') || filePath.endsWith('.tgz')
return isGzip ? scanTarGz(filePath) : scanNdjsonFile(filePath)
}