UNPKG

@cyclonedx/cdxgen

Version:

Creates CycloneDX Software Bill of Materials (SBOM) from source or container image

1,341 lines (1,294 loc) 37.7 kB
import process from "node:process"; import { createHuggingFaceComponentReference, createHuggingFaceDatasetReference, createHuggingFaceModelCard, } from "../../parsers/huggingfaceManifest.js"; import { detectAiModelVariants, normalizeDetectedVariants, } from "../aiModelVariants.js"; import { encodeHuggingFacePathSegments, HF_BASE_URL, HUGGING_FACE_ANCESTOR_RELATIONS, quantizationValueFromConfig, repositoryUrlForHuggingFaceAssetType, sanitizeHuggingFaceRepoId, toHuggingFaceAssetPath, toHuggingFaceAssetUrl, toHuggingFacePurl, } from "../huggingfaceUtils.js"; import { sanitizeStructuredValueForBom } from "../propertySanitizer.js"; import { cdxgenAgent, getLicenses, isDryRun, recordActivity, } from "../utils.js"; const CACHE_MISS = Symbol("huggingface-cache-miss"); const HUGGING_FACE_CACHE_TTL_MS = Number.parseInt(process.env.CDXGEN_HUGGINGFACE_CACHE_TTL_MS || "", 10) || 5 * 60 * 1000; const HUGGING_FACE_CACHE_MAX_ENTRIES = Number.parseInt(process.env.CDXGEN_HUGGINGFACE_CACHE_MAX_ENTRIES || "", 10) || 256; const HUGGING_FACE_REQUEST_TIMEOUT = { lookup: 1000, connect: 5000, secureConnect: 5000, socket: 10000, send: 10000, response: 10000, }; const HUGGING_FACE_ACCESS_TOKEN = process.env.HF_TOKEN || process.env.HUGGING_FACE_HUB_TOKEN || process.env.HUGGINGFACE_TOKEN; const HUGGING_FACE_MODEL_EXPAND_KEYS = [ "pipeline_tag", "private", "gated", "downloads", "likes", "lastModified", "author", "cardData", "config", "createdAt", "disabled", "downloadsAllTime", "inferenceProviderMapping", "library_name", "model-index", "safetensors", "sha", "siblings", "spaces", "tags", ]; const HUGGING_FACE_MAX_TAGS = 256; const HUGGING_FACE_MAX_TAG_LENGTH = 128; const HUGGING_FACE_MAX_BASE_MODELS = 20; const HUGGING_FACE_MAX_PEDIGREE_DEPTH = 3; const HUGGING_FACE_MAX_SIBLINGS = 10000; const HUGGING_FACE_DATASET_EXPAND_KEYS = [ "private", "downloads", "gated", "likes", "lastModified", "author", "cardData", "citation", "createdAt", "description", "disabled", "downloadsAllTime", "paperswithcode_id", "sha", "tags", ]; const HUGGING_FACE_SPACE_EXPAND_KEYS = [ "sdk", "likes", "private", "lastModified", "author", "cardData", "datasets", "disabled", "createdAt", "models", "runtime", "sha", "subdomain", "tags", ]; const createExpiringCache = () => { const entries = new Map(); const deleteExpiredEntries = (now = Date.now()) => { for (const [key, entry] of entries) { if (entry.expiresAt <= now) { entries.delete(key); } } }; return { clear() { entries.clear(); }, get(key) { deleteExpiredEntries(); const entry = entries.get(key); if (!entry) { return { hit: false, value: undefined }; } entries.delete(key); entries.set(key, entry); return { hit: true, value: entry.value === CACHE_MISS ? undefined : entry.value, }; }, set(key, value) { deleteExpiredEntries(); entries.delete(key); entries.set(key, { expiresAt: Date.now() + HUGGING_FACE_CACHE_TTL_MS, value: value === undefined ? CACHE_MISS : value, }); while (entries.size > HUGGING_FACE_CACHE_MAX_ENTRIES) { entries.delete(entries.keys().next().value); } }, }; }; const responseCache = createExpiringCache(); const payloadCache = createExpiringCache(); export { normalizeHuggingFaceReference, toHuggingFacePurl, } from "../huggingfaceUtils.js"; /** * Clear the in-process Hugging Face caches used for remote metadata lookup. */ export function resetHuggingFaceRemoteCaches() { responseCache.clear(); payloadCache.clear(); } const normalizeLicenseFilePath = (licensePath) => String(licensePath || "") .trim() .replace(/^\/+?/u, "") .replaceAll(/\/+?/gu, "/"); const apiPathForType = (assetType, repoId) => { const encodedRepoId = encodeHuggingFacePathSegments(repoId); switch (assetType) { case "dataset": return `datasets/${encodedRepoId}`; case "space": return `spaces/${encodedRepoId}`; default: return `models/${encodedRepoId}`; } }; const expandQueryForType = (assetType) => { const expandKeys = assetType === "dataset" ? HUGGING_FACE_DATASET_EXPAND_KEYS : assetType === "space" ? HUGGING_FACE_SPACE_EXPAND_KEYS : HUGGING_FACE_MODEL_EXPAND_KEYS; return new URLSearchParams( expandKeys.map((key) => ["expand", key]), ).toString(); }; const resolveHuggingFaceAccessToken = (options = {}) => options.huggingFaceAccessToken || options.huggingFaceToken || HUGGING_FACE_ACCESS_TOKEN; const resolveHuggingFaceRevision = (options = {}) => { const revision = options.huggingFaceRevision || options.version || options.revision || "HEAD"; return String(revision || "HEAD").trim() || "HEAD"; }; const toTrustedHuggingFaceUrl = (candidate) => { if (typeof candidate !== "string") { return undefined; } const trimmed = candidate.trim(); if (!trimmed) { return undefined; } try { const parsed = new URL(trimmed); if (!/^https?:$/iu.test(parsed.protocol)) { return undefined; } if (parsed.username || parsed.password) { return undefined; } if ( parsed.hostname !== "huggingface.co" && !parsed.hostname.endsWith(".huggingface.co") ) { return undefined; } parsed.search = ""; parsed.hash = ""; return parsed.toString().replace(/\/+$/u, ""); } catch { return undefined; } }; const selectLicenseFileUrl = (payload, assetType, repoId) => { const cardData = payload?.cardData || {}; for (const candidate of [ cardData?.license_link, cardData?.licenseLink, cardData?.license_url, cardData?.licenseUrl, payload?.licenseUrl, payload?.license_url, ]) { const trustedUrl = toTrustedHuggingFaceUrl(candidate); if (trustedUrl) { return trustedUrl; } } const licenseSibling = normalizeArray(payload?.siblings).find((sibling) => { const siblingPath = normalizeLicenseFilePath( sibling?.rfilename || sibling?.path || sibling?.name, ); return /(?:^|\/)(?:licen[cs]e|copying|copyright)(?:\.[^/]+)?$/iu.test( siblingPath, ); }); const licensePath = normalizeLicenseFilePath( licenseSibling?.rfilename || licenseSibling?.path || licenseSibling?.name, ); if (!licensePath) { return undefined; } const revision = payload?.sha || "main"; const assetPath = toHuggingFaceAssetPath(assetType, repoId); if (!assetPath) { return undefined; } return `${HF_BASE_URL}/${assetPath}/resolve/${encodeURIComponent(revision)}/${encodeHuggingFacePathSegments(licensePath)}`; }; const createExternalReference = (type, url, comment) => { const sanitizedUrl = toTrustedHuggingFaceUrl(url) || sanitizeStructuredValueForBom(url); if (!sanitizedUrl || typeof sanitizedUrl !== "string") { return undefined; } const reference = { type, url: sanitizedUrl }; if (comment) { reference.comment = comment; } return reference; }; const createGenericExternalReference = (type, candidate, comment) => { if (typeof candidate !== "string") { return undefined; } const trimmed = candidate.trim(); if (!trimmed) { return undefined; } try { const parsed = new URL(trimmed); parsed.username = ""; parsed.password = ""; parsed.search = ""; parsed.hash = ""; const reference = { type, url: parsed.toString() }; if (comment) { reference.comment = comment; } return reference; } catch { return undefined; } }; const uniqueExternalReferences = (references) => [ ...new Map( references .filter(Boolean) .map((reference) => [`${reference.type}:${reference.url}`, reference]), ).values(), ]; const selectLicenseValue = (payload) => { const cardData = payload?.cardData || {}; return ( payload?.license || cardData?.license || cardData?.license_name || cardData?.licenseName ); }; const normalizeLicenseValue = (licenseValue) => { if (typeof licenseValue !== "string") { return licenseValue; } const normalized = licenseValue.trim(); if (!normalized) { return normalized; } return normalized.toUpperCase(); }; const toLicenseSpec = (payload, assetType, repoId) => { const type = normalizeLicenseValue(selectLicenseValue(payload)); const url = selectLicenseFileUrl(payload, assetType, repoId); if (!type && !url) { return undefined; } return [{ type, url }]; }; const toComponentType = (assetType) => { switch (assetType) { case "dataset": return "data"; case "space": return "application"; default: return "machine-learning-model"; } }; const toDescription = (payload) => payload?.description || payload?.cardData?.model_description || payload?.cardData?.summary || payload?.cardData?.description; const normalizeArray = (value) => Array.isArray(value) ? value : value === undefined ? [] : [value]; const normalizeTagValues = (values) => [ ...new Set( normalizeArray(values) .flatMap((value) => normalizeArray(value)) .map((tag) => (typeof tag === "string" ? tag.trim() : undefined)) .filter(Boolean) .map((tag) => tag.slice(0, HUGGING_FACE_MAX_TAG_LENGTH)), ), ].slice(0, HUGGING_FACE_MAX_TAGS); const normalizeBaseModelReferences = (cardData, options = {}) => { const maxBaseModels = Math.max( 1, Number.parseInt(options.maxBaseModels, 10) || HUGGING_FACE_MAX_BASE_MODELS, ); return [ ...new Set( normalizeArray(cardData?.base_model) .concat(normalizeArray(cardData?.base_models)) .map((value) => (typeof value === "string" ? value.trim() : undefined)) .filter(Boolean), ), ].slice(0, maxBaseModels); }; const toFiniteNumber = (value) => { const normalizedValue = typeof value === "string" ? Number(value.replaceAll(/,/gu, "")) : Number(value); return Number.isFinite(normalizedValue) ? normalizedValue : undefined; }; const formatCompactCount = (value) => { const normalizedValue = toFiniteNumber(value); return normalizedValue === undefined ? undefined : new Intl.NumberFormat("en-US", { maximumFractionDigits: normalizedValue >= 10 ? 0 : 1, notation: "compact", }).format(normalizedValue); }; const formatByteSize = (value) => { const normalizedValue = toFiniteNumber(value); if (normalizedValue === undefined || normalizedValue < 0) { return undefined; } if (normalizedValue < 1024) { return `${normalizedValue} B`; } const units = ["KB", "MB", "GB", "TB"]; let size = normalizedValue; let unitIndex = -1; while (size >= 1024 && unitIndex < units.length - 1) { size /= 1024; unitIndex += 1; } return `${Number(size.toFixed(size >= 10 ? 0 : 1))} ${units[unitIndex]}`; }; const appendUniqueProperty = (properties, name, value) => { if (value === undefined || value === null || value === "") { return; } if ( !properties.some( (property) => property?.name === name && property?.value === String(value), ) ) { properties.push({ name, value: String(value) }); } }; const flattenDatasetInfoEntries = (datasetInfo) => { if (!datasetInfo) { return []; } if (Array.isArray(datasetInfo)) { return datasetInfo; } if (typeof datasetInfo === "object") { return Object.values(datasetInfo); } return []; }; const extractDatasetStats = (payload) => { const datasetInfoEntries = flattenDatasetInfoEntries( payload?.cardData?.dataset_info || payload?.cardData?.datasetInfo || payload?.dataset_info || payload?.datasetInfo, ); const splitEntries = datasetInfoEntries.flatMap((entry) => flattenDatasetInfoEntries(entry?.splits), ); const rowCount = [ toFiniteNumber(payload?.cardData?.dataset_size), toFiniteNumber(payload?.dataset_size), toFiniteNumber(payload?.cardData?.num_rows), toFiniteNumber(payload?.num_rows), splitEntries.reduce( (sum, split) => sum + (toFiniteNumber(split?.num_examples) ?? toFiniteNumber(split?.num_rows) ?? 0), 0, ) || undefined, ].find((value) => value !== undefined); const sizeBytes = [ toFiniteNumber(payload?.cardData?.size_in_bytes), toFiniteNumber(payload?.cardData?.dataset_size), toFiniteNumber(payload?.size_in_bytes), toFiniteNumber(payload?.dataset_size), toFiniteNumber(payload?.cardData?.download_size), toFiniteNumber(payload?.download_size), ].find((value) => value !== undefined); const splitCount = splitEntries.length || flattenDatasetInfoEntries(payload?.cardData?.splits).length || undefined; return { rowCount, sizeBytes, splitCount, }; }; const datasetDescriptionFromStats = (payload) => { const descriptionParts = [toDescription(payload)]; const datasetStats = extractDatasetStats(payload); const sizeParts = [ datasetStats.rowCount !== undefined ? `${new Intl.NumberFormat("en-US").format(datasetStats.rowCount)} rows` : undefined, datasetStats.splitCount !== undefined ? `${datasetStats.splitCount} split(s)` : undefined, formatByteSize(datasetStats.sizeBytes), ].filter(Boolean); if (sizeParts.length) { descriptionParts.push(`Dataset size: ${sizeParts.join(", ")}`); } return descriptionParts.filter(Boolean).join(". "); }; const extractSafetensorMetadata = (payload) => { const totalParameters = toFiniteNumber(payload?.cardData?.parameters) || toFiniteNumber(payload?.config?.num_parameters) || toFiniteNumber(payload?.safetensors?.total) || Object.values(payload?.safetensors?.parameters || {}).reduce( (sum, value) => sum + (toFiniteNumber(value) || 0), 0, ) || undefined; const tensorTypes = Object.entries(payload?.safetensors?.parameters || {}) .filter(([, value]) => toFiniteNumber(value) !== undefined) .sort( (left, right) => (toFiniteNumber(right[1]) || 0) - (toFiniteNumber(left[1]) || 0), ) .map(([tensorType]) => String(tensorType)); return { parameterCount: totalParameters, parameterCountLabel: totalParameters ? `${formatCompactCount(totalParameters)} params` : undefined, tensorTypes: [...new Set(tensorTypes)], }; }; const createRemoteEvidence = ( field, concludedValue, assetType, repoId, sourceUrl, revision, ) => ({ identity: [ { field, confidence: 0.7, concludedValue, methods: [ { technique: "other", confidence: 0.7, value: `${assetType} metadata from Hugging Face API for ${repoId}${revision ? ` @ ${revision}` : ""}`, }, ...(sourceUrl ? [ { technique: "other", confidence: 0.7, value: sourceUrl, }, ] : []), { technique: "source-code-analysis", confidence: 0.6, value: `huggingface:${assetType}:${repoId}`, }, ], }, ], }); const createPedigreeModelReference = (modelRef) => { return createHuggingFaceComponentReference(modelRef, { includeDatasetPurl: false, }); }; const createDatasetReference = (dataset) => { const datasetReference = createHuggingFaceDatasetReference(dataset, { componentScope: "excluded", componentSource: "huggingface-api", componentTags: ["ai", "dataset", "huggingface"], urlSanitizer: toTrustedHuggingFaceUrl, }); return datasetReference ? { component: { ...datasetReference.component, scope: "excluded", }, ref: datasetReference.ref, } : undefined; }; const createModelReference = (modelRef) => { const reference = createHuggingFaceComponentReference(modelRef, { includeDatasetPurl: false, }); return reference ? { component: { ...reference, properties: [ { name: "cdx:ai:provider", value: "huggingface" }, { name: "cdx:ai:kind", value: "model" }, { name: "cdx:ai:source", value: "huggingface-space-metadata" }, ], tags: ["ai", "huggingface", "model"], }, ref: reference["bom-ref"], } : undefined; }; const createHuggingFaceExternalReferences = ( payload, assetType, repoId, relatedSpaces = [], ) => uniqueExternalReferences([ { type: "distribution", url: toHuggingFaceAssetUrl(assetType, repoId), }, createExternalReference( "license", selectLicenseFileUrl(payload, assetType, repoId), ), createGenericExternalReference( "citation", payload?.doi?.id ? `https://doi.org/${payload.doi.id}` : undefined, ), ...normalizeArray(payload?.arxivIds).map((arxivId) => createGenericExternalReference( "citation", `https://arxiv.org/abs/${String(arxivId).trim()}`, ), ), ...relatedSpaces .slice(0, 5) .map((spaceRepoId) => createGenericExternalReference( "website", toHuggingFaceAssetUrl("space", spaceRepoId), "Related Hugging Face Space", ), ), ]); const toModelCard = (payload, addDatasetReference) => { const modelCard = createHuggingFaceModelCard( { ...(payload?.cardData || {}), pipeline_tag: payload?.pipeline_tag || payload?.cardData?.pipeline_tag, }, payload?.config, addDatasetReference, { urlSanitizer: toTrustedHuggingFaceUrl }, ) || {}; const safetensorMetadata = extractSafetensorMetadata(payload); const modelCardProperties = [...normalizeArray(modelCard.properties)]; appendUniqueProperty( modelCardProperties, "cdx:ai:safetensors:parameterCount", safetensorMetadata.parameterCount, ); appendUniqueProperty( modelCardProperties, "cdx:ai:safetensors:parameterCountLabel", safetensorMetadata.parameterCountLabel, ); for (const tensorType of safetensorMetadata.tensorTypes) { appendUniqueProperty( modelCardProperties, "cdx:ai:safetensors:tensorType", tensorType, ); } if (modelCardProperties.length) { modelCard.properties = modelCardProperties; } return Object.keys(modelCard).length ? sanitizeStructuredValueForBom(modelCard) : undefined; }; const fetchHuggingFacePayload = async (assetType, repoId, _options = {}) => { const normalizedRepoId = sanitizeHuggingFaceRepoId(repoId); if (!normalizedRepoId) { return undefined; } const normalizedAssetType = ["dataset", "space"].includes(assetType) ? assetType : "model"; const revision = resolveHuggingFaceRevision(_options); const accessToken = resolveHuggingFaceAccessToken(_options); const useCache = !accessToken; const cacheKey = `${normalizedAssetType}:${normalizedRepoId}:${revision}`; if (useCache) { const cachedPayload = payloadCache.get(cacheKey); if (cachedPayload.hit) { return cachedPayload.value; } } const targetUrl = `${HF_BASE_URL}/api/${apiPathForType(normalizedAssetType, normalizedRepoId)}/revision/${encodeURIComponent(revision)}?${expandQueryForType(normalizedAssetType)}`; if (isDryRun) { recordActivity({ kind: "network", networkIntent: "metadata-fetch", reason: "Dry run mode blocks outbound network access (metadata-fetch).", status: "blocked", target: targetUrl, }); if (useCache) { payloadCache.set(cacheKey, undefined); } return undefined; } try { const response = await cdxgenAgent.get(targetUrl, { headers: accessToken ? { Authorization: `Bearer ${accessToken}` } : undefined, responseType: "json", timeout: HUGGING_FACE_REQUEST_TIMEOUT, }); if (!response?.body) { if (useCache) { payloadCache.set(cacheKey, undefined); } return undefined; } const payload = response.body; if (useCache) { payloadCache.set(cacheKey, payload); } return payload; } catch { if (useCache) { payloadCache.set(cacheKey, undefined); } return undefined; } }; const resolvePedigreeComponent = async ( modelRef, options, ancestryTrail = new Set(), currentDepth = 1, ) => { const reference = createPedigreeModelReference(modelRef); if (!reference?.["bom-ref"]) { return undefined; } if ( reference.type !== "machine-learning-model" || ancestryTrail.has(reference["bom-ref"]) ) { return reference; } const maxDepth = Math.max( 1, Number.parseInt(options?.maxPedigreeDepth, 10) || HUGGING_FACE_MAX_PEDIGREE_DEPTH, ); if (currentDepth >= maxDepth) { return reference; } const nextAncestryTrail = new Set(ancestryTrail); nextAncestryTrail.add(reference["bom-ref"]); const payload = await fetchHuggingFacePayload( "model", `${reference.group}/${reference.name}`, { ...options, huggingFaceRevision: undefined, revision: undefined, version: undefined, }, ); if (!payload) { return reference; } const quantization = quantizationValueFromConfig(payload?.config?.quantization_config) || quantizationValueFromConfig(payload?.quantization_config) || payload?.cardData?.quantization; const pedigree = await toPedigree( payload, quantization, detectPayloadVariants(payload, quantization), options, nextAncestryTrail, currentDepth + 1, ); if (pedigree) { reference.pedigree = pedigree; } return reference; }; const toPedigree = async ( payload, quantization, variants = [], options = {}, ancestryTrail = new Set(), currentDepth = 1, ) => { const cardData = payload?.cardData || {}; const relation = cardData.base_model_relation; const relatedModelRefs = normalizeBaseModelReferences(cardData, options); const relatedModels = await Promise.all( relatedModelRefs.map((modelRef) => resolvePedigreeComponent(modelRef, options, ancestryTrail, currentDepth), ), ); const filteredRelatedModels = relatedModels.filter(Boolean); if (!filteredRelatedModels.length) { return undefined; } const pedigreeKey = !relation || HUGGING_FACE_ANCESTOR_RELATIONS.has(String(relation).toLowerCase()) ? "ancestors" : "variants"; const pedigree = { [pedigreeKey]: [ ...new Map( filteredRelatedModels.map((component) => [ component["bom-ref"], component, ]), ).values(), ], }; const notes = [ relation ? `Hugging Face relation: ${relation}` : undefined, quantization ? `Quantization: ${quantization}` : undefined, variants.length ? `Detected variants: ${variants.join(", ")}` : undefined, ].filter(Boolean); if (notes.length) { pedigree.notes = notes.join("; "); } return pedigree; }; const toProperties = (assetType, payload) => { const properties = [ { name: "cdx:ai:provider", value: "huggingface" }, { name: "cdx:ai:source", value: "huggingface-api" }, ]; if (assetType === "model") { properties.push({ name: "cdx:ai:kind", value: "model" }); } else if (assetType === "dataset") { properties.push({ name: "cdx:ai:kind", value: "dataset" }); } else if (assetType === "space") { properties.push({ name: "cdx:ai:kind", value: "space" }); } const pipelineTag = payload?.pipeline_tag || payload?.cardData?.pipeline_tag; if (pipelineTag) { properties.push({ name: "cdx:ai:modality", value: String(pipelineTag) }); } const parameterCount = payload?.cardData?.parameters || payload?.config?.num_parameters; if (parameterCount !== undefined && parameterCount !== null) { properties.push({ name: "cdx:ai:parameterCount", value: String(parameterCount), }); } const contextWindow = payload?.config?.max_position_embeddings || payload?.cardData?.context_length; if (contextWindow !== undefined && contextWindow !== null) { properties.push({ name: "cdx:ai:contextWindow", value: String(contextWindow), }); } const quantization = quantizationValueFromConfig(payload?.config?.quantization_config) || quantizationValueFromConfig(payload?.quantization_config) || payload?.cardData?.quantization; if (quantization) { properties.push({ name: "cdx:ai:quantization", value: String(quantization), }); } appendUniqueProperty( properties, "cdx:huggingface:downloads", payload?.downloads, ); appendUniqueProperty( properties, "cdx:huggingface:downloadsAllTime", payload?.downloadsAllTime, ); appendUniqueProperty(properties, "cdx:huggingface:likes", payload?.likes); appendUniqueProperty( properties, "cdx:huggingface:likesRecent", payload?.likesRecent, ); appendUniqueProperty(properties, "cdx:huggingface:gated", payload?.gated); appendUniqueProperty(properties, "cdx:huggingface:private", payload?.private); appendUniqueProperty( properties, "cdx:huggingface:disabled", payload?.disabled, ); appendUniqueProperty( properties, "cdx:huggingface:createdAt", payload?.createdAt, ); appendUniqueProperty( properties, "cdx:huggingface:lastModified", payload?.lastModified, ); appendUniqueProperty( properties, "cdx:huggingface:fileCount", Math.min( normalizeArray(payload?.siblings).length, HUGGING_FACE_MAX_SIBLINGS, ) || undefined, ); appendUniqueProperty( properties, "cdx:huggingface:gatedFieldCount", payload?.cardData?.extra_gated_fields ? Object.keys(payload.cardData.extra_gated_fields).length : undefined, ); appendUniqueProperty( properties, "cdx:huggingface:gatedPromptCustomized", payload?.cardData?.extra_gated_prompt ? "true" : undefined, ); if (assetType === "model") { appendUniqueProperty( properties, "cdx:huggingface:libraryName", payload?.library_name || payload?.cardData?.library_name, ); appendUniqueProperty( properties, "cdx:huggingface:spaceCount", normalizeArray(payload?.spaces).length || undefined, ); appendUniqueProperty( properties, "cdx:huggingface:inferenceProviderCount", normalizeArray(payload?.inferenceProviderMapping).length || undefined, ); for (const mapping of normalizeArray(payload?.inferenceProviderMapping)) { appendUniqueProperty( properties, "cdx:huggingface:inferenceProvider", mapping?.provider, ); appendUniqueProperty( properties, "cdx:huggingface:inferenceTask", mapping?.task, ); appendUniqueProperty( properties, "cdx:huggingface:inferenceStatus", mapping?.status, ); } } else if (assetType === "dataset") { appendUniqueProperty( properties, "cdx:huggingface:previewable", payload?.previewable, ); appendUniqueProperty( properties, "cdx:huggingface:papersWithCodeId", payload?.paperswithcode_id || payload?.cardData?.paperswithcode_id, ); appendUniqueProperty( properties, "cdx:huggingface:citationDetected", payload?.citation ? "true" : undefined, ); appendUniqueProperty( properties, "cdx:huggingface:viewer", payload?.cardData?.viewer, ); for (const taskCategory of normalizeArray( payload?.cardData?.task_categories, )) { appendUniqueProperty( properties, "cdx:huggingface:taskCategory", taskCategory, ); } for (const taskId of normalizeArray(payload?.cardData?.task_ids)) { appendUniqueProperty(properties, "cdx:huggingface:taskId", taskId); } for (const language of normalizeArray(payload?.cardData?.language)) { appendUniqueProperty(properties, "cdx:huggingface:language", language); } for (const language of normalizeArray(payload?.cardData?.language_bcp47)) { appendUniqueProperty( properties, "cdx:huggingface:languageBcp47", language, ); } } else if (assetType === "space") { appendUniqueProperty(properties, "cdx:huggingface:sdk", payload?.sdk); appendUniqueProperty( properties, "cdx:huggingface:subdomain", payload?.subdomain, ); appendUniqueProperty( properties, "cdx:huggingface:runtimeStage", payload?.runtime?.stage, ); appendUniqueProperty( properties, "cdx:huggingface:sdkVersion", payload?.runtime?.sdkVersion, ); appendUniqueProperty( properties, "cdx:huggingface:runtimeHardwareCurrent", payload?.runtime?.hardware?.current, ); appendUniqueProperty( properties, "cdx:huggingface:runtimeHardwareRequested", payload?.runtime?.hardware?.requested, ); appendUniqueProperty( properties, "cdx:huggingface:modelCount", normalizeArray(payload?.models).length || undefined, ); appendUniqueProperty( properties, "cdx:huggingface:datasetCount", normalizeArray(payload?.datasets).length || undefined, ); } for (const variant of detectPayloadVariants(payload, quantization)) { properties.push({ name: "cdx:ai:variant", value: String(variant), }); } return properties; }; const detectPayloadVariants = (payload, quantization) => normalizeDetectedVariants( detectAiModelVariants({ description: toDescription(payload), metadata: [payload?.cardData?.library_name], modelName: payload?.id, quantization, relation: payload?.cardData?.base_model_relation, tags: normalizeTagValues([payload?.tags, payload?.cardData?.tags]), }), ); /** * Check whether remote Hugging Face metadata resolution is enabled. * * @param {Object} [options={}] CLI options * @returns {boolean} true when remote resolution is enabled */ export const isHuggingFaceRemoteEnabled = (options = {}) => Boolean( options?.aiHuggingFaceRemote || options?.resolveHuggingFaceRemote || process.env.CDXGEN_HUGGINGFACE_REMOTE === "true", ); /** * Resolve a Hugging Face model, dataset, or space into a BOM component. * * @param {string} assetType asset type such as model or dataset * @param {string} repoId Hugging Face repository id * @param {Object} [options={}] fetch and header overrides * @returns {Promise<Object|undefined>} resolved BOM component */ export async function fetchHuggingFaceAssetInventory( assetType, repoId, options = {}, ) { const normalizedRepoId = sanitizeHuggingFaceRepoId(repoId); if (!normalizedRepoId) { return undefined; } const normalizedAssetType = ["dataset", "space"].includes(assetType) ? assetType : "model"; const revision = resolveHuggingFaceRevision(options); const accessToken = resolveHuggingFaceAccessToken(options); const useCache = !accessToken; const cacheKey = `${normalizedAssetType}:${normalizedRepoId}:${revision}`; if (useCache) { const cachedInventory = responseCache.get(cacheKey); if (cachedInventory.hit) { return cachedInventory.value; } } try { const payload = await fetchHuggingFacePayload( normalizedAssetType, normalizedRepoId, options, ); if (!payload) { if (useCache) { responseCache.set(cacheKey, undefined); } return undefined; } const slashIndex = normalizedRepoId.indexOf("/"); const quantization = quantizationValueFromConfig(payload?.config?.quantization_config) || quantizationValueFromConfig(payload?.quantization_config) || payload?.cardData?.quantization; const safetensorMetadata = extractSafetensorMetadata(payload); const purl = toHuggingFacePurl( normalizedRepoId, payload?.sha || (revision !== "HEAD" ? revision : payload?.lastModified), repositoryUrlForHuggingFaceAssetType(normalizedAssetType), ); const variants = normalizedAssetType === "model" ? detectPayloadVariants(payload, quantization) : []; const ancestryRoot = purl || toHuggingFaceAssetUrl(normalizedAssetType, normalizedRepoId); const relatedComponents = []; const datasetDependencyRefs = new Set(); const modelDependencyRefs = new Set(); const externalReferences = createHuggingFaceExternalReferences( payload, normalizedAssetType, normalizedRepoId, normalizeArray(payload?.spaces), ); const component = { "bom-ref": purl, type: toComponentType(normalizedAssetType), group: normalizedRepoId.slice(0, slashIndex), name: normalizedRepoId.slice(slashIndex + 1), version: payload?.sha || (revision !== "HEAD" ? revision : payload?.lastModified), purl, description: normalizedAssetType === "dataset" ? datasetDescriptionFromStats(payload) : toDescription(payload), externalReferences, licenses: getLicenses({ license: toLicenseSpec(payload, normalizedAssetType, normalizedRepoId), }), evidence: createRemoteEvidence( "purl", purl, normalizedAssetType, normalizedRepoId, `${HF_BASE_URL}/api/${apiPathForType(normalizedAssetType, normalizedRepoId)}/revision/${encodeURIComponent(revision)}`, revision, ), modelCard: normalizedAssetType === "model" ? toModelCard(payload, (dataset) => { const datasetReference = createDatasetReference(dataset); if (!datasetReference) { return undefined; } relatedComponents.push(datasetReference.component); datasetDependencyRefs.add(datasetReference.component["bom-ref"]); return datasetReference.ref; }) : undefined, pedigree: normalizedAssetType === "model" ? await toPedigree( payload, quantization, variants, options, new Set([ancestryRoot]), ) : undefined, data: normalizedAssetType === "dataset" ? [ { type: "dataset", name: normalizedRepoId, contents: { url: toHuggingFaceAssetUrl( normalizedAssetType, normalizedRepoId, ), }, description: datasetDescriptionFromStats(payload), }, ] : undefined, properties: toProperties(normalizedAssetType, payload), tags: [ ...new Set([ "ai", "huggingface", normalizedAssetType, ...variants, ...normalizeTagValues([payload?.tags, payload?.cardData?.tags]), ]), ].slice(0, HUGGING_FACE_MAX_TAGS), }; if (normalizedAssetType === "dataset") { component.scope = "excluded"; } if (safetensorMetadata.parameterCount !== undefined) { component.properties.push({ name: "cdx:ai:parameterCount", value: String(safetensorMetadata.parameterCount), }); } if (normalizedAssetType === "space") { for (const model of normalizeArray(payload?.models)) { const modelReference = createModelReference(model); if (!modelReference) { continue; } relatedComponents.push(modelReference.component); modelDependencyRefs.add(modelReference.ref); } for (const dataset of normalizeArray(payload?.datasets)) { const datasetReference = createDatasetReference(dataset); if (!datasetReference) { continue; } relatedComponents.push(datasetReference.component); datasetDependencyRefs.add(datasetReference.component["bom-ref"]); } } const inventory = { components: [ component, ...new Map( relatedComponents.map((entry) => [entry["bom-ref"], entry]), ).values(), ], dependencies: datasetDependencyRefs.size ? [ { ref: component["bom-ref"], dependsOn: Array.from( new Set([ ...Array.from(datasetDependencyRefs), ...Array.from(modelDependencyRefs), ]), ).sort(), }, ] : modelDependencyRefs.size ? [ { ref: component["bom-ref"], dependsOn: Array.from(modelDependencyRefs).sort(), }, ] : [], primaryComponent: component, }; if (useCache) { responseCache.set(cacheKey, inventory); } return inventory; } catch { if (useCache) { responseCache.set(cacheKey, undefined); } return undefined; } } /** * Resolve a Hugging Face asset to the primary CycloneDX component only. * * @param {string} assetType asset type such as model or dataset * @param {string} repoId Hugging Face repository id * @param {Object} [options={}] fetch and header overrides * @returns {Promise<Object|undefined>} primary resolved component */ export async function fetchHuggingFaceAssetMetadata( assetType, repoId, options = {}, ) { const inventory = await fetchHuggingFaceAssetInventory( assetType, repoId, options, ); return inventory?.primaryComponent; }