UNPKG

@cyclonedx/cdxgen

Version:

Creates CycloneDX Software Bill of Materials (SBOM) from source or container image

478 lines (460 loc) 14.6 kB
import process from "node:process";

import esmock from "esmock";
import { assert, describe, it } from "poku";
import sinon from "sinon";

import { quantizationValueFromConfig } from "../huggingfaceUtils.js";
import {
  normalizeHuggingFaceReference,
  toHuggingFacePurl,
} from "./huggingface.js";

/** Environment variables that may carry a Hugging Face access token. */
const HUGGING_FACE_TOKEN_ENV_KEYS = [
  "HF_TOKEN",
  "HUGGING_FACE_HUB_TOKEN",
  "HUGGINGFACE_TOKEN",
];

/** Purl the tests expect for the ultrachat dataset fixture. */
const ULTRACHAT_DATASET_PURL =
  "pkg:huggingface/HuggingFaceH4/ultrachat_200k?repository_url=https%3A%2F%2Fhuggingface.co%2Fdatasets";

/** Purl the tests expect for the zephyr model fixture pinned to its sha. */
const ZEPHYR_MODEL_PURL_AT_SHA =
  "pkg:huggingface/HuggingFaceH4/zephyr-7b-beta@abc123";

/**
 * Runs `callback` with every Hugging Face token variable removed from
 * `process.env`, then puts the previous values back (or leaves the variable
 * unset when it was unset before), even when the callback throws.
 *
 * @param {() => any} callback - work to run while the variables are cleared
 * @returns {Promise<any>} whatever the callback resolves to
 */
const withClearedHuggingFaceTokenEnv = async (callback) => {
  const savedPairs = HUGGING_FACE_TOKEN_ENV_KEYS.map((key) => [
    key,
    process.env[key],
  ]);
  for (const [key] of savedPairs) {
    delete process.env[key];
  }
  try {
    return await callback();
  } finally {
    for (const [key, savedValue] of savedPairs) {
      if (savedValue !== undefined) {
        process.env[key] = savedValue;
      } else {
        delete process.env[key];
      }
    }
  }
};

/**
 * Stand-in for `getLicenses` from `../utils.js`: wraps the `license` field in
 * a single-entry list, taking `type` of the first element when it is an array.
 * Returns `undefined` when `license` is falsy.
 */
const licenseEntriesFromField = ({ license }) => {
  if (!license) {
    return undefined;
  }
  const id = Array.isArray(license) ? license[0]?.type : license;
  return [{ license: { id } }];
};

/**
 * Imports `./huggingface.js` through esmock with `../utils.js` replaced by a
 * stub surface whose HTTP agent is `getStub`.
 *
 * NOTE(review): the token variables are cleared only while the module is
 * being imported; the fetch calls made by the tests afterwards run with the
 * caller's environment restored. Confirm this is the intended scope.
 *
 * @param {Function} getStub - sinon stub used as `cdxgenAgent.get`
 * @param {Function} [getLicenses] - replacement for `getLicenses`
 * @returns {Promise<object>} the mocked module namespace
 */
const loadMockedHuggingFaceModule = (
  getStub,
  getLicenses = licenseEntriesFromField,
) =>
  withClearedHuggingFaceTokenEnv(async () =>
    esmock("./huggingface.js", {
      "../utils.js": {
        cdxgenAgent: { get: getStub },
        getLicenses,
        isDryRun: false,
        recordActivity: sinon.stub(),
      },
    }),
  );

/** True when `properties` holds an entry with exactly this name and value. */
const hasProperty = (properties, name, value) =>
  properties.some((entry) => entry.name === name && entry.value === value);

/** True when some component strictly equals `expected` on every listed key. */
const hasComponentMatching = (components, expected) =>
  components.some((candidate) =>
    Object.entries(expected).every(
      ([key, wanted]) => candidate[key] === wanted,
    ),
  );

/** Builds a fresh payload for the base model referenced by the zephyr card. */
const buildMistralBaseModelBody = () => ({
  id: "mistralai/Mistral-7B-v0.1",
  sha: "BASE123",
  cardData: {},
  config: {},
  likes: 1,
  downloads: 1,
  gated: false,
  private: false,
});

/** Builds a fresh, fully populated payload for the zephyr model fixture. */
const buildZephyrModelBody = () => ({
  id: "HuggingFaceH4/zephyr-7b-beta",
  sha: "ABC123",
  author: "HuggingFaceH4",
  arxivIds: ["2401.00001"],
  description: "Helpful assistant model",
  disabled: false,
  downloads: 1234,
  downloadsAllTime: 67890,
  gated: false,
  inferenceProviderMapping: [
    {
      provider: "hf-inference",
      status: "live",
      task: "text-generation",
    },
  ],
  lastModified: "2025-01-01T00:00:00.000Z",
  library_name: "transformers",
  likes: 99,
  likesRecent: 12,
  private: false,
  spaces: ["HuggingFaceH4/zephyr-chat"],
  tags: ["chat"],
  doi: { id: "10.5555/example-doi", commit: "abc123" },
  siblings: [{ rfilename: "README.md" }, { rfilename: "LICENSE" }],
  cardData: {
    base_model: ["mistralai/Mistral-7B-v0.1"],
    base_model_relation: "adapter",
    datasets: [
      {
        config: "default",
        name: "HuggingFaceH4/ultrachat_200k",
        split: "train",
      },
    ],
    extra_gated_fields: { company: "text" },
    extra_gated_prompt: "Research access request",
    language: ["en"],
    license: "Apache-2.0",
    mask_token: "<mask>",
    "model-index": [
      {
        results: [
          {
            dataset: {
              name: "HuggingFaceH4/ultrachat_200k",
              split: "train",
            },
            metrics: [{ type: "MT-Bench", value: 7.5 }],
          },
        ],
      },
    ],
    pipeline_tag: "text-generation",
    tags: ["summarization"],
    widget: [
      {
        messages: [{ role: "user", content: "Hello" }],
        output: { text: "Hi" },
      },
    ],
  },
  config: {
    architectures: ["LlamaForCausalLM"],
    model_type: "llama",
    quantization_config: {
      load_in_4bit: true,
      quant_method: "bnb",
    },
  },
});

describe("huggingface remote helper", () => {
  it("normalizes direct repo ids and Hugging Face URLs", () => {
    // Each row pairs a reference string with the descriptor it must produce.
    const recognizedReferences = [
      [
        "openai/whisper-small",
        { assetType: "model", repoId: "openai/whisper-small" },
      ],
      [
        "https://huggingface.co/datasets/argilla/databricks-dolly-15k",
        { assetType: "dataset", repoId: "argilla/databricks-dolly-15k" },
      ],
      [
        "pkg:huggingface/openai/whisper-small@ABC123",
        {
          assetType: "model",
          repoId: "openai/whisper-small",
          version: "ABC123",
        },
      ],
      [
        "https://huggingface.co/api/models/openai/whisper-small/revision/refs%2Fpr%2F7",
        {
          assetType: "model",
          repoId: "openai/whisper-small",
          version: "refs/pr/7",
        },
      ],
    ];
    for (const [reference, expected] of recognizedReferences) {
      assert.deepStrictEqual(
        normalizeHuggingFaceReference(reference),
        expected,
      );
    }
    // An absolute filesystem path is expected to yield no descriptor.
    assert.strictEqual(
      normalizeHuggingFaceReference("/tmp/cdxgen-ai-inventory-1234"),
      undefined,
    );
  });

  it("creates PackageURL-based Hugging Face purls", () => {
    // Argument lists are spread as-is so each call keeps its original arity.
    const purlCases = [
      [
        ["HuggingFaceH4/zephyr-7b-beta", "ABC123"],
        "pkg:huggingface/HuggingFaceH4/zephyr-7b-beta@abc123",
      ],
      [
        ["HuggingFaceH4/ultrachat_200k"],
        "pkg:huggingface/HuggingFaceH4/ultrachat_200k",
      ],
      [
        [
          "HuggingFaceH4/ultrachat_200k",
          undefined,
          "https://huggingface.co/datasets",
        ],
        "pkg:huggingface/HuggingFaceH4/ultrachat_200k?repository_url=https%3A%2F%2Fhuggingface.co%2Fdatasets",
      ],
    ];
    for (const [purlArgs, expectedPurl] of purlCases) {
      assert.strictEqual(toHuggingFacePurl(...purlArgs), expectedPurl);
    }
  });

  it("derives readable quantization labels from config objects", () => {
    const quantizationCases = [
      [{ quant_method: "bnb", load_in_4bit: true }, "bnb 4-bit"],
      [{ quant_type: "nf4", bits: 8 }, "nf4 8-bit"],
      ["gguf-q5_k_m", "gguf-q5_k_m"],
    ];
    for (const [config, expectedLabel] of quantizationCases) {
      assert.strictEqual(quantizationValueFromConfig(config), expectedLabel);
    }
  });

  it("fetches Hugging Face inventory, links dataset components, and honors cache reset", async () => {
    // Requests naming the base model get the minimal payload; everything
    // else gets the zephyr payload. A new object is built for every call.
    const getStub = sinon.stub().callsFake(async (url) => {
      const body = url.includes("mistralai/Mistral-7B-v0.1")
        ? buildMistralBaseModelBody()
        : buildZephyrModelBody();
      return { body };
    });
    const { fetchHuggingFaceAssetInventory, resetHuggingFaceRemoteCaches } =
      await loadMockedHuggingFaceModule(getStub);
    const fetchZephyr = () =>
      fetchHuggingFaceAssetInventory(
        "model",
        "HuggingFaceH4/zephyr-7b-beta",
        {},
      );

    resetHuggingFaceRemoteCaches();
    const inventory = await fetchZephyr();
    const cachedInventory = await fetchZephyr();

    assert.ok(inventory, "expected remote inventory");
    // Two fetches produced only two agent calls in total.
    assert.strictEqual(getStub.callCount, 2);
    const [firstRequestUrl] = getStub.firstCall.args;
    assert.match(
      firstRequestUrl,
      /\/api\/models\/HuggingFaceH4\/zephyr-7b-beta\/revision\/HEAD\?/u,
    );
    assert.match(firstRequestUrl, /expand=downloadsAllTime/u);
    assert.strictEqual(
      cachedInventory?.primaryComponent?.name,
      "zephyr-7b-beta",
    );

    const { primaryComponent } = inventory;
    assert.strictEqual(primaryComponent.purl, ZEPHYR_MODEL_PURL_AT_SHA);
    const { modelCard } = primaryComponent;
    assert.strictEqual(
      modelCard.modelParameters.datasets[0].ref,
      ULTRACHAT_DATASET_PURL,
    );
    assert.ok(
      hasComponentMatching(inventory.components, {
        type: "data",
        group: "HuggingFaceH4",
        name: "ultrachat_200k",
        purl: ULTRACHAT_DATASET_PURL,
      }),
    );
    assert.deepStrictEqual(inventory.dependencies, [
      {
        ref: ZEPHYR_MODEL_PURL_AT_SHA,
        dependsOn: [ULTRACHAT_DATASET_PURL],
      },
    ]);

    assert.ok(
      hasProperty(
        primaryComponent.properties,
        "cdx:ai:quantization",
        "bnb 4-bit",
      ),
    );
    assert.ok(
      hasProperty(
        primaryComponent.properties,
        "cdx:huggingface:downloadsAllTime",
        "67890",
      ),
    );
    assert.ok(
      hasProperty(
        primaryComponent.properties,
        "cdx:huggingface:inferenceProvider",
        "hf-inference",
      ),
    );
    assert.ok(
      hasProperty(
        primaryComponent.modelCard.properties,
        "cdx:huggingface:maskToken",
        "<mask>",
      ),
    );
    assert.strictEqual(
      primaryComponent.modelCard.modelParameters.inputs[0].format,
      "text",
    );
    assert.ok(
      primaryComponent.externalReferences.some(
        (reference) =>
          reference.type === "citation" &&
          reference.url === "https://doi.org/10.5555/example-doi",
      ),
    );

    // After a cache reset the same fetch has to hit the agent again.
    resetHuggingFaceRemoteCaches();
    await fetchZephyr();
    assert.strictEqual(getStub.callCount, 4);
  });

  it("resolves Hugging Face spaces into application components with model and dataset dependencies", async () => {
    const spaceBody = {
      id: "team/demo-space",
      sha: "SPACE123",
      createdAt: "2025-02-01T00:00:00.000Z",
      lastModified: "2025-02-02T00:00:00.000Z",
      likes: 42,
      private: false,
      sdk: "gradio",
      subdomain: "team-demo-space",
      datasets: ["HuggingFaceH4/ultrachat_200k"],
      models: ["HuggingFaceH4/zephyr-7b-beta"],
      runtime: {
        stage: "RUNNING",
        sdkVersion: "5.0.0",
        hardware: { current: "cpu-basic", requested: "cpu-basic" },
      },
      tags: ["chatbot"],
    };
    const getStub = sinon.stub().resolves({ body: spaceBody });
    const { fetchHuggingFaceAssetInventory, resetHuggingFaceRemoteCaches } =
      await loadMockedHuggingFaceModule(getStub);

    resetHuggingFaceRemoteCaches();
    const inventory = await fetchHuggingFaceAssetInventory(
      "space",
      "team/demo-space",
      {},
    );

    assert.ok(inventory, "expected space inventory");
    assert.strictEqual(inventory.primaryComponent.type, "application");
    assert.ok(
      hasProperty(
        inventory.primaryComponent.properties,
        "cdx:huggingface:runtimeStage",
        "RUNNING",
      ),
    );
    assert.ok(
      hasComponentMatching(inventory.components, {
        type: "machine-learning-model",
        purl: "pkg:huggingface/HuggingFaceH4/zephyr-7b-beta",
      }),
    );
    assert.ok(
      hasComponentMatching(inventory.components, {
        type: "data",
        purl: ULTRACHAT_DATASET_PURL,
      }),
    );
    assert.deepStrictEqual(inventory.dependencies, [
      {
        ref: "pkg:huggingface/team/demo-space@space123?repository_url=https%3A%2F%2Fhuggingface.co%2Fspaces",
        dependsOn: [
          ULTRACHAT_DATASET_PURL,
          "pkg:huggingface/HuggingFaceH4/zephyr-7b-beta",
        ],
      },
    ]);
  });

  it("caps pedigree expansion and tag/property bloat from remote payloads", async () => {
    const baseModels = Array.from(
      { length: 30 },
      (_, position) => `org/base-${position}`,
    );
    const getStub = sinon.stub().callsFake(async (url) => {
      // The repo id sits between "/models/" and "/revision/" in the URL.
      const [, afterModels] = url.split("/models/");
      const [encodedRef] = afterModels.split("/revision/");
      const modelRef = decodeURIComponent(encodedRef);
      if (!modelRef.startsWith("org/base-")) {
        // Root model: oversized file list, tag list, and one very long tag.
        return {
          body: {
            id: "org/root-model",
            sha: "ROOT123",
            siblings: Array.from({ length: 12000 }, (_, position) => ({
              rfilename: `weights-${position}.bin`,
            })),
            tags: Array.from(
              { length: 400 },
              (_, position) => `tag-${position}`,
            ),
            cardData: {
              base_models: baseModels,
              tags: [`x${"y".repeat(200)}`],
            },
          },
        };
      }
      return {
        body: {
          id: modelRef,
          sha: `sha-${modelRef}`,
          cardData: {},
        },
      };
    });
    const { fetchHuggingFaceAssetInventory, resetHuggingFaceRemoteCaches } =
      await loadMockedHuggingFaceModule(getStub, () => undefined);

    resetHuggingFaceRemoteCaches();
    const inventory = await fetchHuggingFaceAssetInventory(
      "model",
      "org/root-model",
      {},
    );

    assert.ok(inventory);
    // 30 base models are advertised, yet only 21 requests are expected.
    assert.strictEqual(getStub.callCount, 21);
    const { tags, properties } = inventory.primaryComponent;
    assert.ok(tags.length <= 256);
    assert.ok(tags.every((tag) => tag.length <= 128));
    assert.strictEqual(
      hasProperty(properties, "cdx:huggingface:fileCount", "10000"),
      true,
    );
  });
});