@sanity/export
Version:
Export Sanity documents and assets
534 lines (448 loc) • 15.4 kB
text/typescript
import {createHash} from 'node:crypto'
import {createWriteStream, mkdirSync} from 'node:fs'
import {join as joinPath} from 'node:path'
import {pipeline} from 'node:stream/promises'
import PQueue from 'p-queue'
import {delay} from './util/delay.js'
import {through, throughObj} from './util/streamHelpers.js'
import {
ASSET_DOWNLOAD_CONCURRENCY,
ASSET_DOWNLOAD_MAX_RETRIES,
DEFAULT_RETRY_DELAY,
} from './constants.js'
import {debug} from './debug.js'
import {getUserAgent} from './getUserAgent.js'
import {requestStream} from './requestStream.js'
import type {
AssetDocument,
AssetMap,
AssetMetadata,
ResponseStream,
SanityClientLike,
SanityDocument,
} from './types.js'
import {rm} from 'node:fs/promises'
/** Marks asset fields for removal when walking a document with `findAndModify`. */
const ACTION_REMOVE = 'remove' as const

/** Marks asset fields for replacement with a `_sanityAsset` placeholder in `findAndModify`. */
const ACTION_REWRITE = 'rewrite' as const

/** The two ways `findAndModify` can treat an asset field. */
type AssetAction = typeof ACTION_REMOVE | typeof ACTION_REWRITE

/** Asset document properties that are left out of the metadata stored in the asset map. */
const EXCLUDE_PROPS = [
  '_id',
  '_type',
  'assetId',
  'extension',
  'mimeType',
  'path',
  'url',
]
/**
 * Constructor options for `AssetHandler`.
 */
interface AssetHandlerOptions {
// Client whose `config().token` is read when building asset request headers.
client: SanityClientLike
// Base directory; downloaded assets are written beneath it (in `files/` and `images/`).
tmpDir: string
// NOTE(review): not read by the AssetHandler code in this file — confirm whether it is still needed.
prefix?: string
// Concurrency for the queue the handler creates itself; falls back to ASSET_DOWNLOAD_CONCURRENCY.
concurrency?: number
// Upper bound on download attempts per asset; falls back to ASSET_DOWNLOAD_MAX_RETRIES.
maxRetries?: number
// Wait between attempts (presumably milliseconds — confirm against `delay`); falls back to
// DEFAULT_RETRY_DELAY when omitted.
retryDelayMs?: number
// Pre-built queue to use instead of creating a new one.
queue?: PQueue
}
/**
 * Request parameters produced by `AssetHandler.getAssetRequestOptions` and passed on to
 * `requestStream` when downloading an asset.
 */
interface AssetRequestOptions {
// URL to request; for authorized image downloads it carries the `dlRaw=true` query parameter.
url: string
// HTTP headers; always contains `User-Agent`, and `Authorization` when a token is applied.
headers: Record<string, string>
}
/**
 * An object that references an asset through `asset._ref`. `isAssetField` narrows to this type.
 */
interface AssetField {
asset: {
// Asset document ID, e.g. beginning with `image-` or `file-`.
_ref: string
}
// Any sibling properties of the field are carried along untouched.
[key: string]: unknown
}
/**
 * Shape an asset field takes after `findAndModify` rewrites it: the `asset` reference is
 * replaced by a `_sanityAsset` placeholder and the remaining properties are kept.
 */
interface RewrittenAssetField {
// Placeholder of the form `<type>@file://./<type>s/<filename>`.
_sanityAsset: string
[key: string]: unknown
}
/**
 * Error raised while downloading an asset, optionally tagged with the HTTP status of the
 * response so the retry logic can decide whether to ignore, abort or retry.
 */
interface DownloadError extends Error {
// HTTP status code of the failed response, when one was received.
statusCode?: number
}
/**
 * Handles asset documents and asset references during an export.
 *
 * It offers stream transforms (built with `throughObj`) that either rewrite asset references to
 * local placeholders while queueing the asset downloads, strip assets entirely, skip asset
 * documents, or pass documents through unchanged. Downloads are written beneath `tmpDir`,
 * verified against the hash headers of the response, and their metadata collected in `assetMap`.
 */
export class AssetHandler {
  /** Client whose `config().token` is used to authorize image downloads. */
  client: SanityClientLike
  /** Base directory that receives the `files/` and `images/` folders. */
  tmpDir: string
  /** Whether the `files/` and `images/` folders have been created already. */
  assetDirsCreated: boolean
  /** URLs of assets that have been queued and not yet downloaded successfully. */
  downloading: string[]
  /** Asset document ID → asset type (`image` or `file`) for every asset document seen. */
  assetsSeen: Map<string, string>
  /** Metadata of downloaded assets, keyed by `<type>-<sha1 of downloaded content>`. */
  assetMap: AssetMap
  /** Number of assets written to disk so far. */
  filesWritten: number
  /** Number of download tasks queued since construction or the last `clear()`. */
  queueSize: number
  /** Maximum number of download attempts per asset. */
  maxRetries: number
  /** Delay between attempts; `DEFAULT_RETRY_DELAY` is used when undefined. */
  retryDelayMs: number | undefined
  /** Queue running the download tasks. */
  queue: PQueue
  /** First fatal download error recorded before `finish()` installed its own rejecter. */
  rejectedError: Error | null
  /** Invoked with the error when a download fails fatally. */
  reject: (err: Error) => void

  /**
   * @param options - Client, target directory and optional queue/retry tuning.
   */
  constructor(options: AssetHandlerOptions) {
    const concurrency = options.concurrency ?? ASSET_DOWNLOAD_CONCURRENCY
    debug('Using asset download concurrency of %d', concurrency)

    this.client = options.client
    this.tmpDir = options.tmpDir
    this.assetDirsCreated = false
    this.downloading = []
    this.assetsSeen = new Map()
    this.assetMap = {}
    this.filesWritten = 0
    this.queueSize = 0
    this.maxRetries = options.maxRetries ?? ASSET_DOWNLOAD_MAX_RETRIES
    this.retryDelayMs = options.retryDelayMs
    this.queue = options.queue ?? new PQueue({concurrency})
    this.rejectedError = null

    // Until `finish()` is called there is no promise to reject, so just remember the error.
    this.reject = (err: Error): void => {
      this.rejectedError = err
    }
  }

  /** Forgets the assets seen so far and drops all download tasks that have not started. */
  clear(): void {
    this.assetsSeen.clear()
    this.queue.clear()
    this.queueSize = 0
  }

  /**
   * Waits for the download queue to become idle.
   *
   * @returns A promise resolving to the asset map, or rejecting with the error of a fatally
   *   failed download (including one that failed before this method was called).
   */
  finish(): Promise<AssetMap> {
    return new Promise((resolve, reject) => {
      if (this.rejectedError) {
        reject(this.rejectedError)
        return
      }

      // Failures from now on reject this promise directly.
      this.reject = reject
      void this.queue.onIdle().then(() => resolve(this.assetMap))
    })
  }

  // Called when we want to download all assets to local filesystem and rewrite documents to hold
  // placeholder asset references (_sanityAsset: 'image@file:///local/path')
  rewriteAssets = throughObj(
    (doc: SanityDocument | AssetDocument, _enc: BufferEncoding, callback) => {
      if (['sanity.imageAsset', 'sanity.fileAsset'].includes(doc._type)) {
        const assetDoc = doc as AssetDocument
        const type = doc._type === 'sanity.imageAsset' ? 'image' : 'file'
        const filePath = `${type}s/${generateFilename(doc._id)}`
        this.assetsSeen.set(doc._id, type)
        this.queueAssetDownload(assetDoc, filePath)
        // Asset documents themselves are not emitted downstream.
        callback()
        return
      }

      callback(null, this.findAndModify(doc, ACTION_REWRITE))
    },
  )

  // Called in the case where we don't _want_ assets, so basically just remove all asset documents
  // as well as references to assets (*.asset._ref ^= (image|file)-)
  stripAssets = throughObj((doc: SanityDocument, _enc: BufferEncoding, callback) => {
    if (['sanity.imageAsset', 'sanity.fileAsset'].includes(doc._type)) {
      callback()
      return
    }

    callback(null, this.findAndModify(doc, ACTION_REMOVE))
  })

  // Called when we are using raw export mode along with `assets: false`, where we simply
  // want to skip asset documents but retain asset references (useful for data mangling)
  skipAssets = throughObj((doc: SanityDocument, _enc: BufferEncoding, callback) => {
    const isAsset = ['sanity.imageAsset', 'sanity.fileAsset'].includes(doc._type)
    if (isAsset) {
      callback()
      return
    }

    callback(null, doc)
  })

  // Passes every document through unchanged
  noop = throughObj((doc: SanityDocument, _enc: BufferEncoding, callback) => callback(null, doc))

  /**
   * Adds a download task for the given asset document to the queue.
   *
   * Responses with HTTP 401, 403 and 404 are logged and ignored. Other 4xx responses fail
   * immediately; anything else is attempted up to `maxRetries` times. A fatal failure clears the
   * queue and is reported through `this.reject`.
   *
   * @param assetDoc - Asset document to download; skipped when it has no `url`.
   * @param dstPath - Destination path relative to `tmpDir`.
   */
  queueAssetDownload(assetDoc: AssetDocument, dstPath: string): void {
    if (!assetDoc.url) {
      debug('Asset document "%s" does not have a URL property, skipping', assetDoc._id)
      return
    }

    debug('Adding download task for %s (destination: %s)', assetDoc._id, dstPath)
    this.queueSize++
    this.downloading.push(assetDoc.url)

    const doDownload = async (): Promise<boolean> => {
      let dlError: DownloadError | undefined
      for (let attempt = 0; attempt < this.maxRetries; attempt++) {
        try {
          return await this.downloadAsset(assetDoc, dstPath)
        } catch (err) {
          const downloadError = err as DownloadError

          // Ignore inaccessible assets
          switch (downloadError.statusCode) {
            case 401:
            case 403:
            case 404:
              console.warn(
                `⚠ Asset failed with HTTP %d (ignoring): %s`,
                downloadError.statusCode,
                assetDoc._id,
              )
              return true
            default:
          }

          debug(
            `Error downloading asset %s (destination: %s), attempt %d`,
            assetDoc._id,
            dstPath,
            attempt,
            err,
          )

          dlError = downloadError

          if (
            downloadError.statusCode &&
            downloadError.statusCode >= 400 &&
            downloadError.statusCode < 500
          ) {
            // Don't retry on client errors
            break
          }

          // Only wait when another attempt follows; sleeping before giving up is wasted time.
          if (attempt < this.maxRetries - 1) {
            await delay(this.retryDelayMs ?? DEFAULT_RETRY_DELAY)
          }
        }
      }

      // Keep the original error (and its status code) reachable through `cause`.
      throw new Error(
        dlError?.message ?? 'Unknown error downloading asset',
        dlError ? {cause: dlError} : undefined,
      )
    }

    this.queue
      .add(() =>
        doDownload().catch((err: unknown) => {
          debug('Failed to download the asset, aborting download', err)
          this.queue.clear()
          this.reject(err instanceof Error ? err : new Error(String(err)))
        }),
      )
      .catch((error: unknown) => {
        debug('Queued task failed', error)
      })
  }

  /** Creates the `files/` and `images/` folders beneath `tmpDir` the first time it is called. */
  maybeCreateAssetDirs(): void {
    if (this.assetDirsCreated) {
      return
    }

    mkdirSync(joinPath(this.tmpDir, 'files'), {recursive: true})
    mkdirSync(joinPath(this.tmpDir, 'images'), {recursive: true})
    this.assetDirsCreated = true
  }

  /**
   * Builds the URL and headers used to request an asset.
   *
   * For image assets served from a known host, and only when the client has a token, the
   * request is authorized with that token and `dlRaw=true` is added to the URL.
   *
   * @param assetDoc - Asset document whose `url` should be requested.
   * @returns The URL (unchanged if it cannot be parsed) and request headers.
   */
  getAssetRequestOptions(assetDoc: AssetDocument): AssetRequestOptions {
    const token = this.client.config().token
    const headers: Record<string, string> = {'User-Agent': getUserAgent()}
    const isImage = assetDoc._type === 'sanity.imageAsset'

    const url = URL.parse(assetDoc.url ?? '')
    // If we can't parse it, return as-is
    if (!url) {
      return {url: assetDoc.url ?? '', headers}
    }

    if (
      isImage &&
      token &&
      (url.hostname === 'cdn.sanity.io' ||
        url.hostname === 'cdn.sanity.work' ||
        // used in tests. use a very specific port to avoid conflicts
        url.host === 'localhost:43216')
    ) {
      headers.Authorization = `Bearer ${token}`
      url.searchParams.set('dlRaw', 'true')
    }

    return {url: url.toString(), headers}
  }

  /**
   * Downloads a single asset to `tmpDir/dstPath` and verifies it against the response's
   * `x-sanity-sha1` / `x-sanity-md5` headers.
   *
   * @param assetDoc - Asset document to download.
   * @param dstPath - Destination path relative to `tmpDir`.
   * @returns `true` once the asset has been written and verified.
   * @throws When the request fails, the response status is not 200 (the error then carries
   *   `statusCode`), writing fails, or both hashes mismatch.
   */
  async downloadAsset(assetDoc: AssetDocument, dstPath: string): Promise<boolean> {
    const {url} = assetDoc

    debug('Downloading asset %s', url)

    const options = this.getAssetRequestOptions(assetDoc)

    let stream: ResponseStream
    try {
      stream = await requestStream({
        maxRetries: 0, // We handle retries ourselves in queueAssetDownload
        ...options,
      })
    } catch (err) {
      const message = 'Failed to create asset stream'
      if (err instanceof Error) {
        err.message = `${message}: ${err.message}`
        throw err
      }

      throw new Error(message, {cause: err})
    }

    if (stream.statusCode !== 200) {
      let errMsg: string
      try {
        const err = await tryGetErrorFromStream(stream)
        errMsg = `Referenced asset URL "${url}" returned HTTP ${stream.statusCode}`
        if (err) {
          errMsg = `${errMsg}: ${err}`
        }
      } catch (err) {
        const message = 'Failed to parse error response from asset stream'
        if (err instanceof Error) {
          err.message = `${message}: ${err.message}`
          throw err
        }

        throw new Error(message, {cause: err})
      }

      const streamError: DownloadError = new Error(errMsg)
      if (stream.statusCode !== undefined) {
        streamError.statusCode = stream.statusCode
      }
      throw streamError
    }

    this.maybeCreateAssetDirs()

    debug('Asset stream ready, writing to filesystem at %s', dstPath)
    const tmpPath = joinPath(this.tmpDir, dstPath)
    let sha1 = ''
    let md5 = ''
    let size = 0
    try {
      const res = await writeHashedStream(tmpPath, stream)
      sha1 = res.sha1
      md5 = res.md5
      size = res.size
    } catch (err) {
      const message = 'Failed to write asset stream to filesystem'

      if (err instanceof Error) {
        err.message = `${message}: ${err.message}`
        throw err
      }

      throw new Error(message, {cause: err})
    }

    // Verify it against our downloaded stream to make sure we have the same copy
    const contentLength = stream.headers?.['content-length']
    const remoteSha1 = stream.headers?.['x-sanity-sha1']
    const remoteMd5 = stream.headers?.['x-sanity-md5']
    const hasHash = Boolean(remoteSha1 || remoteMd5)
    const method = sha1 ? 'sha1' : 'md5'

    // Asset validity is primarily determined by the sha1 hash. However, the sha1 hash is computed
    // before certain processes (i.e. svg sanitization) which can result in a different hash.
    // When the sha1 hashes don't match, fallback to using the md5 hash.
    // NOTE(review): the download is only rejected when BOTH headers are present and both
    // mismatch; a lone mismatching header is accepted — confirm this is intended.
    const sha1Differs = remoteSha1 && sha1 !== remoteSha1
    const md5Differs = remoteMd5 && md5 !== remoteMd5
    const differs = sha1Differs && md5Differs

    if (differs) {
      const details = [
        hasHash &&
          (method === 'md5'
            ? `md5 should be ${remoteMd5}, got ${md5}`
            : `sha1 should be ${remoteSha1}, got ${sha1}`),

        contentLength &&
          parseInt(String(contentLength), 10) !== size &&
          `Asset should be ${contentLength} bytes, got ${size}`,
      ]

      const detailsString = `Details:\n - ${details.filter(Boolean).join('\n - ')}`

      // Do not leave the corrupt file behind.
      await rm(tmpPath, {recursive: true, force: true})

      throw new Error(`Failed to download asset at ${assetDoc.url}. ${detailsString}`)
    }

    const isImage = assetDoc._type === 'sanity.imageAsset'
    const type = isImage ? 'image' : 'file'
    const id = `${type}-${sha1}`

    const metaProps = omit(assetDoc, EXCLUDE_PROPS)
    if (Object.keys(metaProps).length > 0) {
      this.assetMap[id] = metaProps
    }

    // Only remove the URL if it is actually tracked: `splice(-1, 1)` would otherwise drop the
    // last (unrelated) entry, e.g. when this method is called without `queueAssetDownload`.
    const downloadingIndex = this.downloading.findIndex((datUrl) => datUrl === url)
    if (downloadingIndex !== -1) {
      this.downloading.splice(downloadingIndex, 1)
    }

    this.filesWritten++
    return true
  }

  /**
   * Recursively walks a value and either removes or rewrites every asset field in it.
   *
   * Arrays are mapped element by element with `null`/`undefined` results dropped; objects are
   * copied with properties that end up `undefined` left out; other values are returned as-is.
   *
   * @param item - Document or any nested value of it.
   * @param action - `ACTION_REMOVE` to drop asset fields, `ACTION_REWRITE` to replace the
   *   `asset` reference with a `_sanityAsset` placeholder pointing at the local file.
   * @returns The transformed copy, or `undefined` for a removed asset field.
   */
  findAndModify = (item: unknown, action: AssetAction): unknown => {
    if (Array.isArray(item)) {
      const children = item.map((child: unknown) => this.findAndModify(child, action))
      return children.filter((child): child is NonNullable<typeof child> => child != null)
    }

    if (!item || typeof item !== 'object') {
      return item
    }

    const record = item as Record<string, unknown>
    const isAsset = isAssetField(record)
    if (isAsset && action === ACTION_REMOVE) {
      return undefined
    }

    if (isAsset && action === ACTION_REWRITE) {
      const {asset, ...other} = record
      const assetId = asset._ref
      const assetType = getAssetType(record)
      const filePath = `${assetType}s/${generateFilename(assetId)}`
      // The remaining properties may themselves contain asset fields.
      const modified = this.findAndModify(other, action)
      return {
        _sanityAsset: `${assetType}@file://./${filePath}`,
        ...(typeof modified === 'object' && modified !== null ? modified : {}),
      } as RewrittenAssetField
    }

    const newItem: Record<string, unknown> = {}
    const keys = Object.keys(record)
    for (const key of keys) {
      const value = record[key]

      newItem[key] = this.findAndModify(value, action)

      if (typeof newItem[key] === 'undefined') {
        // eslint-disable-next-line @typescript-eslint/no-dynamic-delete
        delete newItem[key]
      }
    }

    return newItem
  }
}
/**
 * Type guard telling whether an object is an asset field, i.e. has an `asset._ref` that is a
 * non-empty string recognized by `isSanityAsset`.
 */
function isAssetField(item: Record<string, unknown>): item is AssetField {
  const ref = (item.asset as {_ref?: unknown} | undefined)?._ref
  if (typeof ref !== 'string' || ref === '') {
    return false
  }

  return isSanityAsset(ref)
}
/**
 * Derives the asset type from the prefix of `item.asset._ref`.
 *
 * @returns `'image'` or `'file'`, or `null` when there is no string reference or it has
 *   neither prefix.
 */
function getAssetType(item: Record<string, unknown>): string | null {
  const ref = (item.asset as {_ref?: unknown} | undefined)?._ref
  if (typeof ref !== 'string') {
    return null
  }

  if (ref.startsWith('image-')) {
    return 'image'
  }

  return ref.startsWith('file-') ? 'file' : null
}
/**
 * Checks whether an ID has the form of an image asset ID
 * (`image-<40 hex chars>-<width>x<height>-<extension>`) or a file asset ID
 * (`file-<40 hex chars>-<extension>`).
 */
function isSanityAsset(assetId: string): boolean {
  const looksLikeImage = /^image-[a-f0-9]{40}-\d+x\d+-[a-z]+$/.test(assetId)
  if (looksLikeImage) {
    return true
  }

  return /^file-[a-f0-9]{40}-[a-z0-9]+$/.test(assetId)
}
/**
 * Turns an asset ID into a local file name: the `image-`/`file-` prefix is dropped and a
 * trailing `-<letters>` segment becomes the extension. Without such a segment the extension is
 * `bin`; IDs that do not match at all yield `<assetId>.bin`.
 */
function generateFilename(assetId: string): string {
  const parts = /^(image|file)-(.*?)(-[a-z]+)?$/.exec(assetId)
  const baseName = parts?.[2]
  if (!baseName) {
    return `${assetId}.bin`
  }

  const suffix = parts?.[3]
  const extension = suffix === undefined ? 'bin' : suffix.replace(/^-/, '')
  return `${baseName}.${extension}`
}
/**
 * Result of `writeHashedStream`: totals gathered while a stream was written to disk.
 */
interface HashResult {
// Sum of the `length` of every chunk written.
size: number
// Hex-encoded SHA-1 digest of the written content.
sha1: string
// Hex-encoded MD5 digest of the written content.
md5: string
}
/**
 * Pipes a stream into a file while computing the MD5 and SHA-1 digests and the total length of
 * the chunks passing through.
 *
 * @param filePath - Path of the file to write.
 * @param stream - Source stream.
 * @returns Size and hex digests once the pipeline has completed.
 */
async function writeHashedStream(
  filePath: string,
  stream: NodeJS.ReadableStream,
): Promise<HashResult> {
  const md5Hash = createHash('md5')
  const sha1Hash = createHash('sha1')
  let totalLength = 0

  // Pass-through step that observes every chunk on its way to the file.
  const tap = through((chunk, _enc, cb) => {
    totalLength += chunk.length
    md5Hash.update(chunk)
    sha1Hash.update(chunk)
    cb(null, chunk)
  })

  const destination = createWriteStream(filePath)
  await pipeline(stream, tap, destination)

  const sha1 = sha1Hash.digest('hex')
  const md5 = md5Hash.digest('hex')
  return {size: totalLength, sha1, md5}
}
/**
 * Reads an error response body to the end and tries to extract a human readable message.
 *
 * @param stream - Response stream to drain; chunks may be Buffers or strings.
 * @returns A promise resolving to:
 *   - `null` when the body is empty, or is a JSON object with neither `message` nor `error`;
 *   - the first of `message` / `error` that is a string, when the body is a JSON object;
 *   - otherwise the raw body text, truncated to 16000 characters (invalid JSON, non-object
 *     JSON, or a `message`/`error` that is not a string).
 *   The promise rejects if the stream emits `error`.
 */
function tryGetErrorFromStream(stream: NodeJS.ReadableStream): Promise<string | null> {
  return new Promise((resolve, reject) => {
    const chunks: Buffer[] = []

    stream.on('data', (chunk: Buffer | string) => {
      // Streams with an encoding set emit strings, which `Buffer.concat` cannot handle.
      chunks.push(typeof chunk === 'string' ? Buffer.from(chunk, 'utf8') : chunk)
    })

    stream.on('end', () => {
      if (chunks.length === 0) {
        resolve(null)
        return
      }

      const text = Buffer.concat(chunks).toString('utf8')
      const rawText = text.slice(0, 16000)

      let parsed: unknown
      try {
        parsed = JSON.parse(text)
      } catch {
        resolve(rawText)
        return
      }

      if (typeof parsed !== 'object' || parsed === null) {
        resolve(rawText)
        return
      }

      const {message, error} = parsed as {message?: unknown; error?: unknown}
      const firstString = [message, error].find(
        (value): value is string => typeof value === 'string',
      )
      if (firstString !== undefined) {
        resolve(firstString)
        return
      }

      // Structured (non-string) details would stringify as "[object Object]"; show the body.
      resolve(message == null && error == null ? null : rawText)
    })

    stream.on('error', reject)
  })
}
/**
 * Creates a shallow copy of `obj` without the listed keys.
 *
 * @param obj - Source object; it is not modified.
 * @param keys - Names of the own enumerable properties to leave out.
 * @returns A new object holding the remaining properties in their original order.
 */
function omit(obj: Record<string, unknown>, keys: string[]): AssetMetadata {
  const kept = Object.entries(obj).filter(([name]) => !keys.includes(name))

  return kept.reduce<AssetMetadata>((result, [name, value]) => {
    result[name] = value
    return result
  }, {})
}