snyk-docker-plugin

import * as Debug from "debug";
import { createReadStream } from "fs";
import { normalize as normalizePath, sep as pathSeparator } from "path";
import { Readable } from "stream";
import { extract, Extract } from "tar-stream";
import { getPlatformFromConfig, InvalidArchiveError } from "..";
import { streamToJson } from "../../stream-utils";
import { PluginOptions } from "../../types";
import { decompressMaybe } from "../decompress-maybe";
import { extractImageLayer } from "../layer";
import {
  ExtractAction,
  ExtractedLayers,
  ExtractedLayersAndManifest,
  ImageConfig,
  OciArchiveManifest,
  OciImageIndex,
  OciManifestInfo,
  OciPlatformInfo,
} from "../types";

const debug = Debug("snyk");

const MEDIATYPE_DOCKER_MANIFEST_V2 =
  "application/vnd.docker.distribution.manifest.v2+json";
const MEDIATYPE_DOCKER_MANIFEST_LIST_V2 =
  "application/vnd.docker.distribution.manifest.list.v2+json";
const MEDIATYPE_OCI_MANIFEST_V1 = "application/vnd.oci.image.manifest.v1+json";
const MEDIATYPE_OCI_MANIFEST_LIST_V1 = "application/vnd.oci.image.index.v1+json";

// Maximum size for JSON metadata files. Matches the limit in streamToJson.
// Files larger than this are layer blobs, not JSON metadata.
const MAX_JSON_SIZE_BYTES = 2 * 1024 * 1024;

/**
 * Turns any thrown value into a printable message.
 * Thrown values are not guaranteed to be Error instances, so `.message`
 * cannot be read unconditionally.
 */
function getErrorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Streams the archive file through the optional decompressor into the given
 * tar extractor.
 *
 * `pipe()` does not forward errors, so each upstream stage gets its own error
 * listener. Without them a missing/unreadable archive would raise an unhandled
 * "error" event and the surrounding promise would never settle.
 *
 * @param archivePath Path of the archive on disk.
 * @param tarExtractor Destination tar-stream extractor.
 * @param onError Invoked when the file stream or the decompressor fails.
 */
function pipeArchiveInto(
  archivePath: string,
  tarExtractor: Extract,
  onError: (error: Error) => void,
): void {
  createReadStream(archivePath)
    .on("error", onError)
    .pipe(decompressMaybe())
    .on("error", onError)
    .pipe(tarExtractor);
}

/**
 * Retrieve the products of files content from the specified oci-archive.
 *
 * Uses a two-pass approach:
 * 1. First pass: Parse JSON metadata (manifests, configs, indexes) to determine
 *    which layers are needed for the target platform.
 * 2. Second pass: Extract only the required layer blobs.
 *
 * This avoids memory issues from buffering large layer blobs unnecessarily.
 *
 * @param ociArchiveFilesystemPath Path to image file saved in oci-archive format.
 * @param extractActions Array of pattern-callbacks pairs.
 * @param options PluginOptions
 * @returns Array of extracted files products sorted by the reverse order of the layers from last to first.
 * @throws InvalidArchiveError when no manifest or image config can be resolved,
 *   or when none of the manifest's layers could be extracted.
 */
export async function extractArchive(
  ociArchiveFilesystemPath: string,
  extractActions: ExtractAction[],
  options: PluginOptions,
): Promise<ExtractedLayersAndManifest> {
  // Pass 1: Extract JSON metadata
  const metadata = await extractMetadata(ociArchiveFilesystemPath);

  // Determine which manifest and layers we need
  const { manifest, imageConfig } = resolveManifestAndConfig(metadata, options);

  // Get the list of layer digests we need to extract
  const requiredLayerDigests = new Set(
    manifest.layers.map((layer) => layer.digest),
  );

  // Pass 2: Extract the required layers
  const { layers, failedDigests } = await extractLayers(
    ociArchiveFilesystemPath,
    requiredLayerDigests,
    extractActions,
  );

  // Report any layer extraction failures
  if (failedDigests.size > 0) {
    const failures = Array.from(failedDigests.entries())
      .map(([digest, error]) => `${digest}: ${error}`)
      .join("; ");
    debug(`Failed to extract ${failedDigests.size} layer(s): ${failures}`);
  }

  // Build the result: keep manifest order, then reverse so the last layer is first
  const filteredLayers = manifest.layers
    .filter((layer) => layers[layer.digest])
    .map((layer) => layers[layer.digest])
    .reverse();

  if (filteredLayers.length === 0) {
    // Provide more context about why extraction failed
    if (failedDigests.size > 0) {
      const failedList = Array.from(failedDigests.keys()).join(", ");
      throw new InvalidArchiveError(
        `Failed to extract any layers from the image. ` +
          `${failedDigests.size} layer(s) failed: ${failedList}`,
      );
    }
    throw new InvalidArchiveError(
      "We found no layers in the provided image. " +
        "The archive may be corrupted or in an unsupported format.",
    );
  }

  // Warn if some but not all layers failed (partial extraction)
  const missingLayers = manifest.layers.filter(
    (layer) => !layers[layer.digest],
  );
  if (missingLayers.length > 0) {
    debug(
      `Warning: ${missingLayers.length} layer(s) from manifest were not extracted: ` +
        missingLayers.map((l) => l.digest).join(", "),
    );
  }

  return {
    layers: filteredLayers,
    manifest,
    imageConfig,
  };
}

/** JSON metadata collected from the archive during the first pass. */
interface ArchiveMetadata {
  /** Parsed top-level `index.json`, if the archive contains one. */
  mainIndexFile?: OciImageIndex;
  /** Image manifests keyed by `<algo>:<hash>` digest. */
  manifests: Record<string, OciArchiveManifest>;
  /** Nested image indexes keyed by `<algo>:<hash>` digest. */
  indexFiles: Record<string, OciImageIndex>;
  /** Every image config blob found, in archive order. */
  configs: ImageConfig[];
}

/**
 * Pass 1: Extract only JSON metadata from the archive.
 *
 * Skips large files (> MAX_JSON_SIZE_BYTES) since they're layer blobs, not JSON.
 * For small files, attempts JSON parse; binary data fails fast on the first byte check.
 *
 * Errors while handling a single entry are logged and the entry is skipped;
 * errors from the file, decompression or tar streams reject the promise.
 */
async function extractMetadata(
  ociArchiveFilesystemPath: string,
): Promise<ArchiveMetadata> {
  return new Promise((resolve, reject) => {
    const tarExtractor: Extract = extract();
    const manifests: Record<string, OciArchiveManifest> = {};
    const configs: ImageConfig[] = [];
    let mainIndexFile: OciImageIndex | undefined;
    const indexFiles: Record<string, OciImageIndex> = {};

    tarExtractor.on("entry", async (header, stream, next) => {
      try {
        if (header.type === "file") {
          const normalizedHeaderName = normalizePath(header.name);
          if (isMainIndexFile(normalizedHeaderName)) {
            mainIndexFile = await streamToJson<OciImageIndex>(stream);
          } else if (
            isBlobPath(normalizedHeaderName) &&
            (header.size === undefined || header.size <= MAX_JSON_SIZE_BYTES)
          ) {
            // Small blob file - try to parse as JSON metadata
            // Large files and non-blob files (oci-layout, etc.) are skipped
            const jsonContent = await tryParseJsonMetadata(stream);
            if (jsonContent !== undefined) {
              const digest = getDigestFromPath(normalizedHeaderName);
              if (isArchiveManifest(jsonContent)) {
                manifests[digest] = jsonContent;
              } else if (isImageIndexFile(jsonContent)) {
                indexFiles[digest] = jsonContent;
              } else if (isImageConfigFile(jsonContent)) {
                configs.push(jsonContent);
              }
            }
          }
          // All other files (non-blob, large blobs) are drained below
        }
      } catch (err) {
        debug(
          `Error processing OCI archive entry ${header.name}: ${getErrorMessage(
            err,
          )}`,
        );
      }
      stream.resume(); // Drain the stream
      next();
    });

    tarExtractor.on("finish", () => {
      resolve({ mainIndexFile, manifests, indexFiles, configs });
    });

    tarExtractor.on("error", (error) => {
      reject(error);
    });

    pipeArchiveInto(ociArchiveFilesystemPath, tarExtractor, reject);
  });
}

/**
 * Attempts to parse a stream as JSON metadata.
 * Returns undefined if the stream doesn't contain valid JSON (e.g., it's a layer blob),
 * if it is empty, if it exceeds MAX_JSON_SIZE_BYTES, or if the stream errors.
 *
 * Uses a fast-fail check: if the first byte isn't '{' or '[', it's not JSON.
 * Note: This doesn't handle JSON with leading whitespace, which is technically valid
 * but never produced by standard OCI tooling.
 *
 * Raw bytes are buffered and decoded once at the end, so a multi-byte UTF-8
 * character split across two chunks is decoded correctly.
 *
 * After a fast-fail the stream is left unconsumed; the caller is expected to
 * drain it (e.g. with `stream.resume()`).
 */
async function tryParseJsonMetadata(stream: Readable): Promise<unknown> {
  return new Promise((resolve) => {
    const chunks: Buffer[] = [];
    let sawFirstByte = false;
    let bytes = 0;
    let oversized = false;
    let resolved = false;

    // Single exit point so the promise is only ever settled once.
    const settle = (value: unknown) => {
      if (!resolved) {
        resolved = true;
        resolve(value);
      }
    };

    const cleanup = () => {
      stream.removeAllListeners("data");
      stream.removeAllListeners("end");
      // Keep a no-op error handler to prevent unhandled error events
      // when the stream is drained after fast-fail
      stream.removeAllListeners("error");
      // tslint:disable-next-line:no-empty
      stream.on("error", () => {});
    };

    stream.on("data", (data: Buffer | string) => {
      const chunk = typeof data === "string" ? Buffer.from(data) : data;
      if (chunk.length === 0) {
        // Nothing to inspect yet; wait for the first real byte.
        return;
      }

      if (!sawFirstByte) {
        sawFirstByte = true;
        // Fast-fail: JSON must start with { or [
        const firstByte = chunk[0];
        if (firstByte !== 0x7b && firstByte !== 0x5b) {
          // 0x7b = '{', 0x5b = '['
          cleanup();
          settle(undefined);
          return;
        }
      }

      bytes += chunk.length;
      if (bytes > MAX_JSON_SIZE_BYTES) {
        // Too large to be metadata: release what was buffered and keep
        // consuming until "end" without storing anything further.
        oversized = true;
        chunks.length = 0;
        return;
      }
      chunks.push(chunk);
    });

    stream.on("end", () => {
      if (oversized || chunks.length === 0) {
        settle(undefined);
        return;
      }
      try {
        settle(JSON.parse(Buffer.concat(chunks).toString("utf8")));
      } catch {
        settle(undefined);
      }
    });

    stream.on("error", () => {
      settle(undefined);
    });
  });
}

/** Outcome of the second pass over the archive. */
interface LayerExtractionResult {
  /** Successfully extracted layers keyed by `<algo>:<hash>` digest. */
  layers: Record<string, ExtractedLayers>;
  /** Error message per digest for layers whose extraction threw. */
  failedDigests: Map<string, string>;
}

/**
 * Pass 2: Extract only the specified layer blobs.
 *
 * Tracks extraction failures so the caller can report which layers failed
 * rather than silently returning incomplete results.
 *
 * @param ociArchiveFilesystemPath Path to image file saved in oci-archive format.
 * @param requiredDigests Digests (`<algo>:<hash>`) of the blobs to extract.
 * @param extractActions Array of pattern-callbacks pairs passed to the layer extractor.
 */
async function extractLayers(
  ociArchiveFilesystemPath: string,
  requiredDigests: Set<string>,
  extractActions: ExtractAction[],
): Promise<LayerExtractionResult> {
  return new Promise((resolve, reject) => {
    const tarExtractor: Extract = extract();
    const layers: Record<string, ExtractedLayers> = {};
    const failedDigests: Map<string, string> = new Map();

    tarExtractor.on("entry", async (header, stream, next) => {
      try {
        if (header.type === "file") {
          const normalizedHeaderName = normalizePath(header.name);
          if (
            !isMainIndexFile(normalizedHeaderName) &&
            isBlobPath(normalizedHeaderName)
          ) {
            const digest = getDigestFromPath(normalizedHeaderName);
            if (requiredDigests.has(digest)) {
              // This is a layer we need - extract it
              try {
                const layer = await extractImageLayer(stream, extractActions);
                layers[digest] = layer;
              } catch (error) {
                const errorMessage = getErrorMessage(error);
                debug(`Failed to extract layer ${digest}: ${errorMessage}`);
                failedDigests.set(digest, errorMessage);
              }
            }
          }
        }
      } catch (err) {
        debug(
          `Error processing archive entry ${header.name}: ${getErrorMessage(
            err,
          )}`,
        );
      }
      stream.resume(); // Drain whatever was not consumed
      next();
    });

    tarExtractor.on("finish", () => {
      resolve({ layers, failedDigests });
    });

    tarExtractor.on("error", (error) => {
      reject(error);
    });

    pipeArchiveInto(ociArchiveFilesystemPath, tarExtractor, reject);
  });
}

/**
 * Checks if a path is in the blobs directory (blobs/<algo>/<hash>).
 * Non-blob files like oci-layout should be skipped.
 */
function isBlobPath(normalizedPath: string): boolean {
  const parts = normalizedPath.split(pathSeparator).filter(Boolean);
  return parts[0] === "blobs" && parts.length >= 3;
}

/**
 * Extracts digest from a blob path in the format blobs/<algo>/<hash>.
 * Returns the digest as <algo>:<hash> to match manifest digest format.
 *
 * Caller should verify isBlobPath() first.
 */
function getDigestFromPath(normalizedPath: string): string {
  const headerParts = normalizedPath.split(pathSeparator).filter(Boolean);
  const algorithm = headerParts[1];
  const hash = headerParts[headerParts.length - 1];
  return `${algorithm}:${hash}`;
}

/**
 * Picks the manifest and image config to use for the requested platform.
 *
 * The platform is taken from `options.platform` when set; otherwise from the
 * single config that is not "unknown"/"unknown", falling back to "linux/amd64".
 *
 * @throws InvalidArchiveError when no manifest or no image config matches.
 */
function resolveManifestAndConfig(
  metadata: ArchiveMetadata,
  options: Partial<PluginOptions>,
): {
  manifest: OciArchiveManifest;
  imageConfig: ImageConfig;
} {
  // Drop only the configs where BOTH os and architecture are "unknown".
  const filteredConfigs = metadata.configs.filter((config) => {
    return config?.os !== "unknown" || config?.architecture !== "unknown";
  });

  const platform =
    options?.platform ||
    (filteredConfigs.length === 1
      ? getPlatformFromConfig(filteredConfigs[0])
      : "linux/amd64");

  const platformInfo = getOciPlatformInfoFromOptionString(platform as string);

  const manifest = getManifest(
    metadata.mainIndexFile,
    metadata.manifests,
    metadata.indexFiles,
    platformInfo,
  );

  if (!manifest) {
    throw new InvalidArchiveError(
      `Could not find manifest for platform ${platformInfo.os}/${platformInfo.architecture} in archive`,
    );
  }

  const imageConfig = getImageConfig(metadata.configs, platformInfo);

  if (imageConfig === undefined) {
    throw new InvalidArchiveError(
      "Could not find the image config in the provided image",
    );
  }

  return { manifest, imageConfig };
}

/**
 * Resolves the manifest for the requested platform.
 *
 * Without a main index the first collected manifest is returned. With one, the
 * index (including nested indexes) is searched for a platform match.
 *
 * @returns The manifest, or undefined if the matching digest was not collected.
 * @throws InvalidArchiveError when the index has no entry for the platform.
 */
function getManifest(
  imageIndex: OciImageIndex | undefined,
  manifestCollection: Record<string, OciArchiveManifest>,
  indexFiles: Record<string, OciImageIndex>,
  platformInfo: OciPlatformInfo,
): OciArchiveManifest | undefined {
  if (!imageIndex) {
    return manifestCollection[Object.keys(manifestCollection)[0]];
  }

  const allManifests = getAllManifestsIndexItems(imageIndex, indexFiles);
  const manifestInfo = getImageManifestInfo(allManifests, platformInfo);

  if (manifestInfo === undefined) {
    throw new InvalidArchiveError(
      "Image does not support the requested CPU architecture or operating system",
    );
  }

  return manifestCollection[manifestInfo.digest];
}

/**
 * Flattens an image index into the list of image-manifest descriptors it
 * references, recursing into nested indexes that were found in the archive.
 * Entries with any other media type, or nested indexes that are missing from
 * `indexFiles`, are ignored.
 */
function getAllManifestsIndexItems(
  imageIndex: OciImageIndex,
  indexFiles: Record<string, OciImageIndex>,
): OciManifestInfo[] {
  const allManifestsInfo: OciManifestInfo[] = [];
  for (const manifest of imageIndex.manifests) {
    if (
      manifest.mediaType === MEDIATYPE_OCI_MANIFEST_V1 ||
      manifest.mediaType === MEDIATYPE_DOCKER_MANIFEST_V2
    ) {
      // an archive manifest file
      allManifestsInfo.push(manifest);
    } else if (
      manifest.mediaType === MEDIATYPE_OCI_MANIFEST_LIST_V1 ||
      manifest.mediaType === MEDIATYPE_DOCKER_MANIFEST_LIST_V2
    ) {
      // nested index
      const index = indexFiles[manifest.digest];
      if (index) {
        allManifestsInfo.push(...getAllManifestsIndexItems(index, indexFiles));
      }
    }
  }
  return allManifestsInfo;
}

/** True when the parsed JSON has a `layers` array, i.e. looks like an image manifest. */
function isArchiveManifest(manifest: any): manifest is OciArchiveManifest {
  return Array.isArray(manifest?.layers);
}

/** True when the parsed JSON has both `architecture` and `rootfs`, i.e. looks like an image config. */
function isImageConfigFile(json: any): json is ImageConfig {
  return Boolean(json?.architecture && json?.rootfs);
}

/** True when the parsed JSON is an OCI image index / Docker manifest list with a `manifests` array. */
function isImageIndexFile(json: any): json is OciImageIndex {
  return (
    (json?.mediaType === MEDIATYPE_OCI_MANIFEST_LIST_V1 ||
      json?.mediaType === MEDIATYPE_DOCKER_MANIFEST_LIST_V2) &&
    Array.isArray(json?.manifests)
  );
}

/** True for the archive's top-level `index.json`. */
function isMainIndexFile(name: string): boolean {
  return name === "index.json";
}

/**
 * Splits a platform string of the form `os/architecture[/variant]` into its parts.
 */
function getOciPlatformInfoFromOptionString(platform: string): OciPlatformInfo {
  const [os, architecture, variant] = platform.split("/") as [
    os: string,
    architecture: string,
    variant: string | undefined,
  ];

  return {
    os,
    architecture,
    variant,
  };
}

/**
 * Selects the manifest descriptor that best matches the requested platform.
 */
function getImageManifestInfo(
  manifests: OciManifestInfo[],
  platformInfo: OciPlatformInfo,
): OciManifestInfo | undefined {
  // manifests do not always have a platform, this is the case for OCI
  // images built with Docker when no platform is specified
  if (manifests.length === 1 && !manifests[0].platform) {
    return manifests[0];
  }

  return getBestMatchForPlatform(
    manifests,
    platformInfo,
    (target: OciManifestInfo): OciPlatformInfo => {
      return {
        os: target.platform?.os,
        architecture: target.platform?.architecture,
        variant: target.platform?.variant,
      };
    },
  );
}

/**
 * Selects the image config that best matches the requested platform.
 * Only `os` and `architecture` are read from a config; no variant is supplied.
 */
function getImageConfig(
  manifests: ImageConfig[],
  platformInfo: OciPlatformInfo,
): ImageConfig | undefined {
  return getBestMatchForPlatform(
    manifests,
    platformInfo,
    (target: ImageConfig): OciPlatformInfo => {
      return {
        os: target.os,
        architecture: target.architecture,
      };
    },
  );
}

/**
 * Returns the item whose os and architecture equal the requested platform.
 * When several items match, the variant is used as a tie-breaker.
 *
 * NOTE(review): with several os/architecture matches and none with an equal
 * variant this returns undefined rather than falling back to the first match.
 * For image configs (which supply no variant) that means a request carrying a
 * variant cannot match when more than one config shares os/architecture —
 * confirm whether that is intended.
 *
 * @param manifests Candidates to choose from.
 * @param platformInfo Requested platform.
 * @param extractPlatformInfoFromManifest Reads the platform of a candidate.
 */
function getBestMatchForPlatform<T>(
  manifests: T[],
  platformInfo: OciPlatformInfo,
  extractPlatformInfoFromManifest: (target: T) => OciPlatformInfo,
): T | undefined {
  const matches = manifests.filter((item) => {
    const { os, architecture } = extractPlatformInfoFromManifest(item);
    return os === platformInfo.os && architecture === platformInfo.architecture;
  });

  if (matches.length > 1) {
    return matches.find((item) => {
      const { variant } = extractPlatformInfoFromManifest(item);
      return variant === platformInfo.variant;
    });
  }

  return matches[0];
}