@cyclonedx/cdxgen
Version:
Creates CycloneDX Software Bill of Materials (SBOM) from source or container image
786 lines (754 loc) • 25.1 kB
JavaScript
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
import os from "node:os";
import { basename, join } from "node:path";
import { assert, describe, it } from "poku";
import {
collectHuggingFaceRepoAiInventory,
collectJsAiInventory,
collectNotebookAiInventory,
collectPromptConfigAiInventory,
collectPythonAiInventory,
} from "./aiCollector.js";
/**
 * Creates a new, uniquely named scratch directory inside the OS temp
 * directory, using the "cdxgen-ai-collector-" name prefix.
 *
 * @returns {string} Path of the directory that was created.
 */
const createTempDir = () => {
  const prefix = join(os.tmpdir(), "cdxgen-ai-collector-");
  return mkdtempSync(prefix);
};
/**
 * Looks up the value of the first entry in `subject.properties` whose
 * `name` equals the requested name.
 *
 * @param {{properties?: Array<{name: string, value: *}>}|null|undefined} subject
 *   Object carrying an optional `properties` list.
 * @param {string} name - Property name to search for.
 * @returns {*} The matching entry's `value`, or `undefined` when the subject,
 *   its property list, or a matching entry is missing.
 */
const getProp = (subject, name) => {
  const match = subject?.properties?.find((entry) => entry.name === name);
  return match?.value;
};
/**
 * Numeric type tags used for GGUF metadata values in the test fixtures.
 *
 * Frozen so that no test can accidentally reassign a tag and silently change
 * the bytes produced for fixtures written afterwards.
 */
const GGUF_METADATA_TYPES = Object.freeze({
  ARRAY: 9,
  STRING: 8,
  UINT32: 4,
  UINT64: 10,
});
/**
 * Emits the value portion of a single metadata entry through the supplied
 * writer callbacks.
 *
 * An ARRAY entry is written as its item type tag (u32), its item count (u64),
 * and then every item in order, each handled recursively as an entry of the
 * item type. STRING, UINT32 and UINT64 entries go straight to the matching
 * writer.
 *
 * @param {Array} chunks - Caller-owned chunk list; only forwarded to the
 *   recursive calls made for array items.
 * @param {{type: number, value: *, itemType?: number}} entry - Entry to write.
 * @param {{pushString: Function, pushU32: Function, pushU64: Function}} writers
 *   Callbacks that perform the actual output.
 * @throws {Error} When `entry.type` is none of the supported type tags.
 */
const writeMetadataValue = (chunks, entry, writers) => {
  switch (entry.type) {
    case GGUF_METADATA_TYPES.ARRAY: {
      writers.pushU32(entry.itemType);
      writers.pushU64(entry.value.length);
      for (const element of entry.value) {
        const elementEntry = { type: entry.itemType, value: element };
        writeMetadataValue(chunks, elementEntry, writers);
      }
      break;
    }
    case GGUF_METADATA_TYPES.STRING: {
      writers.pushString(entry.value);
      break;
    }
    case GGUF_METADATA_TYPES.UINT32: {
      writers.pushU32(entry.value);
      break;
    }
    case GGUF_METADATA_TYPES.UINT64: {
      writers.pushU64(entry.value);
      break;
    }
    default: {
      throw new Error(`Unsupported GGUF test metadata type ${entry.type}`);
    }
  }
};
/**
 * Writes a small GGUF-shaped fixture file made of a header followed by the
 * given metadata entries.
 *
 * Bytes written, in order: the ASCII magic "GGUF", the u32 value 3, the u64
 * value 0, the u64 metadata entry count, and then per entry its key
 * (u64 byte length + UTF-8 bytes), its u32 type tag, and its value as
 * produced by `writeMetadataValue`. All integers are little-endian.
 *
 * @param {string} filePath - Destination file path.
 * @param {Array<{key: string, type: number, value: *, itemType?: number}>} [metadataEntries=[]]
 *   Entries to serialise, in order.
 */
const writeGgufFixture = (filePath, metadataEntries = []) => {
  const chunks = [];

  // Appends a 4-byte little-endian unsigned integer.
  function appendU32(value) {
    const word = Buffer.alloc(4);
    word.writeUInt32LE(value);
    chunks.push(word);
  }

  // Appends an 8-byte little-endian unsigned integer.
  function appendU64(value) {
    const quad = Buffer.alloc(8);
    quad.writeBigUInt64LE(BigInt(value));
    chunks.push(quad);
  }

  // Appends a length-prefixed UTF-8 string (u64 byte length, then the bytes).
  function appendString(value) {
    const encoded = Buffer.from(value, "utf-8");
    appendU64(encoded.length);
    chunks.push(encoded);
  }

  const writers = {
    pushString: appendString,
    pushU32: appendU32,
    pushU64: appendU64,
  };

  // Header: magic, the value 3, the value 0, then the metadata entry count.
  chunks.push(Buffer.from("GGUF"));
  appendU32(3);
  appendU64(0);
  appendU64(metadataEntries.length);

  // Each entry: key, type tag, then the type-specific value encoding.
  for (const entry of metadataEntries) {
    appendString(entry.key);
    appendU32(entry.type);
    writeMetadataValue(chunks, entry, writers);
  }

  writeFileSync(filePath, Buffer.concat(chunks));
};
describe("aiCollector", () => {
// End-to-end check of collectJsAiInventory against a scratch project holding
// a TypeScript source file, a Modelfile, and a GGUF fixture with rich metadata.
it("collects JavaScript AI services, model references, Modelfiles, and GGUF assets", () => {
  // Compact builders for GGUF metadata fixture entries.
  const { ARRAY, STRING, UINT32, UINT64 } = GGUF_METADATA_TYPES;
  const str = (key, value) => ({ key, type: STRING, value });
  const u32 = (key, value) => ({ key, type: UINT32, value });
  const u64 = (key, value) => ({ key, type: UINT64, value });
  const strList = (key, value) => ({
    key,
    type: ARRAY,
    itemType: STRING,
    value,
  });
  const u32List = (key, value) => ({
    key,
    type: ARRAY,
    itemType: UINT32,
    value,
  });
  // Predicate factory: true for a property with exactly this name and value.
  const propIs = (name, value) => (property) =>
    property.name === name && property.value === value;

  const workDir = createTempDir();
  try {
    // --- Arrange: source file, Modelfile, and GGUF fixture ---
    mkdirSync(join(workDir, "src"), { recursive: true });
    const sourceLines = [
      'import OpenAI from "openai";',
      'import { InferenceClient } from "@huggingface/inference";',
      'import { pipeline } from "@huggingface/transformers";',
      'import "langchain";',
      'const model = "gpt-4o-mini";',
      'const repo_id = "openai/whisper-small";',
      'const client = new InferenceClient("sentence-transformers/all-MiniLM-L6-v2");',
      'await fetch("https://api.openai.com/v1/responses");',
      'await fetch("https://huggingface.co/datasets/argilla/databricks-dolly-15k");',
      'pipeline("text-generation", "openai/whisper-small");',
      'const mixtralArtifact = "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/resolve/main/Mixtral-8x7B-Instruct-v0.1-Q5_K_M.gguf";',
    ];
    writeFileSync(join(workDir, "src", "index.ts"), sourceLines.join("\n"));
    writeFileSync(
      join(workDir, "Modelfile"),
      "FROM llama3.2\nPARAMETER temperature 0.1\nLICENSE Apache-2.0\n",
    );
    const ggufPath = join(
      workDir,
      "Mixtral-8x7B-Instruct-v0.1-Q5_K_M-00001-of-00002.gguf",
    );
    const ggufFileName = basename(ggufPath);
    writeGgufFixture(ggufPath, [
      str("general.name", "Mixtral-8x7B-Instruct"),
      str("general.license", "Apache-2.0"),
      str("general.architecture", "llama"),
      str("general.basename", "Mixtral"),
      str("general.size_label", "8x7B"),
      str("general.finetune", "Instruct"),
      str("general.version", "v0.1"),
      str("general.organization", "mistralai"),
      str(
        "general.repo_url",
        "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1",
      ),
      u32("general.base_model.count", 1),
      str(
        "general.base_model.0.repo_url",
        "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2",
      ),
      str("general.base_model.0.name", "Mistral-7B-Instruct-v0.2"),
      str("general.base_model.0.organization", "mistralai"),
      str("general.base_model.0.version", "v0.2"),
      u32("general.quantization_version", 2),
      u32("general.alignment", 64),
      strList("general.tags", [
        "mixture-of-experts",
        "gguf",
        "text-generation",
      ]),
      strList("general.languages", ["en", "fr"]),
      strList("general.datasets", [
        "https://huggingface.co/datasets/mistralai/mixtral-pretrain",
        "internal-curated-corpus",
      ]),
      str("tokenizer.ggml.model", "llama"),
      strList("tokenizer.ggml.tokens", ["<s>", "</s>", "hello", "world"]),
      u32List("tokenizer.ggml.scores", [1, 2, 3, 4]),
      u32List("tokenizer.ggml.token_type", [3, 3, 1, 1]),
      strList("tokenizer.ggml.merges", ["h e", "he llo"]),
      strList("tokenizer.ggml.added_tokens", ["<tool_call>"]),
      u32("tokenizer.ggml.bos_token_id", 1),
      u32("tokenizer.ggml.eos_token_id", 2),
      u32("tokenizer.ggml.padding_token_id", 0),
      str(
        "tokenizer.chat_template",
        "{% for message in messages %}{{ message['content'] }}{% endfor %}",
      ),
      str("tokenizer.huggingface.json", '{"version":"1.0"}'),
      u64("llama.context_length", 32768),
      u32("general.file_type", 17),
    ]);

    // --- Act ---
    const inventory = collectJsAiInventory(workDir, {});

    // --- Locate the entries under test ---
    const openAiService = inventory.services.find(
      (candidate) => candidate.group === "openai",
    );
    const gptModel = inventory.components.find(
      (candidate) => candidate.name === "gpt-4o-mini",
    );
    const hfDataset = inventory.components.find(
      (candidate) =>
        candidate.group === "argilla" &&
        candidate.name === "databricks-dolly-15k",
    );
    const modelfileModel = inventory.components.find((candidate) =>
      candidate.properties?.some(propIs("cdx:ai:artifactFormat", "modelfile")),
    );
    const ggufModel = inventory.components.find(
      (candidate) =>
        candidate.name === "Mixtral-8x7B-Instruct" &&
        candidate.properties?.some(propIs("cdx:ai:artifactFormat", "gguf")),
    );
    const ggufFile = inventory.components.find(
      (candidate) =>
        candidate.type === "file" && candidate.name === ggufFileName,
    );
    const remoteGgufModel = inventory.components.find(
      (candidate) =>
        candidate.purl ===
        "pkg:huggingface/mistralai/Mixtral-8x7B-Instruct-v0.1",
    );

    // --- Assert: every expected entry exists ---
    assert.ok(openAiService, "expected OpenAI service");
    assert.ok(gptModel, "expected OpenAI model component");
    assert.ok(hfDataset, "expected Hugging Face dataset component");
    assert.ok(modelfileModel, "expected Modelfile-derived model component");
    assert.ok(ggufModel, "expected GGUF-derived model component");
    assert.ok(ggufFile, "expected GGUF file component");
    assert.ok(
      remoteGgufModel,
      "expected Hugging Face model component from standard GGUF artifact URL",
    );

    // --- Assert: OpenAI service details and its link to the model ---
    assert.ok(
      openAiService.properties.some(propIs("cdx:ai:modelId", "gpt-4o-mini")),
    );
    assert.ok(Number(getProp(openAiService, "cdx:ai:modelCount")) >= 1);
    for (const [propName, expected] of [
      ["cdx:ai:modelSelection", "explicit"],
      ["cdx:ai:deployment", "remote"],
      ["cdx:ai:transportSecurity", "https"],
    ]) {
      assert.strictEqual(getProp(openAiService, propName), expected);
    }
    assert.ok(
      inventory.dependencies.some(
        (edge) =>
          edge.ref === openAiService["bom-ref"] &&
          edge.dependsOn?.includes(gptModel["bom-ref"]),
      ),
    );

    // --- Assert: dataset reference ---
    assert.ok(
      hfDataset.externalReferences?.some((link) =>
        link.url.includes(
          "huggingface.co/datasets/argilla/databricks-dolly-15k",
        ),
      ),
    );

    // --- Assert: GGUF-derived model properties ---
    assert.ok(
      ggufModel.properties.some(propIs("cdx:ai:contextWindow", "32768")),
    );
    for (const [propName, expected] of [
      ["cdx:ai:quantization", "Q5_K_M"],
      ["cdx:gguf:sizeLabel", "8x7B"],
      ["cdx:gguf:tokenizerModel", "llama"],
      ["cdx:gguf:tokenizerTokenCount", "4"],
      ["cdx:gguf:tokenizerMergeCount", "2"],
      ["cdx:gguf:tokenizerAddedTokenCount", "1"],
      ["cdx:gguf:chatTemplateDetected", "true"],
      ["cdx:gguf:huggingFaceTokenizer", "true"],
      ["cdx:gguf:bosTokenId", "1"],
      ["cdx:gguf:paddingTokenId", "0"],
    ]) {
      assert.strictEqual(getProp(ggufModel, propName), expected);
    }
    assert.strictEqual(ggufModel.version, "v0.1");

    // --- Assert: GGUF-derived model card, pedigree, and references ---
    const modelParameters = ggufModel.modelCard.modelParameters;
    assert.strictEqual(modelParameters.architectureFamily, "llama");
    assert.strictEqual(modelParameters.task, "text-generation");
    assert.strictEqual(
      modelParameters.datasets[0].contents.url,
      "https://huggingface.co/datasets/mistralai/mixtral-pretrain",
    );
    assert.strictEqual(
      modelParameters.datasets[1].name,
      "internal-curated-corpus",
    );
    assert.strictEqual(modelParameters.inputs[0].format, "text");
    assert.strictEqual(modelParameters.outputs[0].format, "text");
    assert.strictEqual(
      ggufModel.pedigree.ancestors[0].purl,
      "pkg:huggingface/mistralai/Mistral-7B-Instruct-v0.2",
    );
    assert.ok(
      ggufModel.externalReferences.some(
        (link) =>
          link.type === "vcs" &&
          link.url ===
            "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1",
      ),
    );

    // --- Assert: GGUF file component and remote artifact model ---
    for (const [propName, expected] of [
      ["cdx:gguf:shard", "00001-of-00002"],
      ["cdx:gguf:alignment", "64"],
      ["cdx:gguf:chatTemplateDetected", "true"],
    ]) {
      assert.strictEqual(getProp(ggufFile, propName), expected);
    }
    assert.strictEqual(
      getProp(remoteGgufModel, "cdx:ai:quantization"),
      "Q5_K_M",
    );
  } finally {
    rmSync(workDir, { force: true, recursive: true });
  }
});
// A GGUF fixture whose general.alignment is the UINT64 value 1048577 must
// still yield a file component, but without a cdx:gguf:alignment property.
it("does not emit out-of-range GGUF alignment values", () => {
  const workDir = createTempDir();
  try {
    writeFileSync(join(workDir, "index.js"), "console.log('ok');\n");
    const ggufPath = join(workDir, "test-model.Q5_K_M.gguf");
    const metadata = [
      {
        key: "general.name",
        type: GGUF_METADATA_TYPES.STRING,
        value: "test-model",
      },
      {
        key: "general.alignment",
        type: GGUF_METADATA_TYPES.UINT64,
        value: 1048577,
      },
    ];
    writeGgufFixture(ggufPath, metadata);

    const { components } = collectJsAiInventory(workDir, {});
    const ggufFile = components.find(
      (candidate) =>
        candidate.type === "file" && candidate.name === basename(ggufPath),
    );

    assert.ok(ggufFile);
    const alignment = getProp(ggufFile, "cdx:gguf:alignment");
    assert.strictEqual(alignment, undefined);
  } finally {
    rmSync(workDir, { force: true, recursive: true });
  }
});
// A GGUF fixture whose general.alignment is the STRING "64evil" must still
// yield a file component, but without a cdx:gguf:alignment property.
it("does not emit malformed string GGUF alignment values", () => {
  const workDir = createTempDir();
  try {
    writeFileSync(join(workDir, "index.js"), "console.log('ok');\n");
    const ggufPath = join(workDir, "test-model.Q5_K_M.gguf");
    const metadata = [
      {
        key: "general.name",
        type: GGUF_METADATA_TYPES.STRING,
        value: "test-model",
      },
      {
        key: "general.alignment",
        type: GGUF_METADATA_TYPES.STRING,
        value: "64evil",
      },
    ];
    writeGgufFixture(ggufPath, metadata);

    const { components } = collectJsAiInventory(workDir, {});
    const ggufFile = components.find(
      (candidate) =>
        candidate.type === "file" && candidate.name === basename(ggufPath),
    );

    assert.ok(ggufFile);
    const alignment = getProp(ggufFile, "cdx:gguf:alignment");
    assert.strictEqual(alignment, undefined);
  } finally {
    rmSync(workDir, { force: true, recursive: true });
  }
});
// Import-looking text that only appears inside string literals must produce
// neither components nor services.
it("ignores string literals that only look like AI SDK imports", () => {
  const workDir = createTempDir();
  try {
    const sourcePath = join(workDir, "index.js");
    const sourceText =
      'const msg = "import { OpenAI } from \'openai\'";\nconst note = "from openai import OpenAI";\n';
    writeFileSync(sourcePath, sourceText);

    const inventory = collectJsAiInventory(workDir, {});

    assert.strictEqual(inventory.components.length, 0);
    assert.strictEqual(inventory.services.length, 0);
  } finally {
    rmSync(workDir, { force: true, recursive: true });
  }
});
// Runs the Python and JS collectors over three checked-in sample app
// fixtures and checks the Hugging Face details each one yields.
it("collects GitHub-derived sample app fixtures with Hugging Face artifact details", () => {
  // Collect all three inventories up front, in a fixed order.
  const localpilotInventory = collectPythonAiInventory(
    "./test/data/ai-huggingface/github-apps/localpilot",
    {},
  );
  const heavenBanBotInventory = collectPythonAiInventory(
    "./test/data/ai-huggingface/github-apps/heaven-ban-bot",
    {},
  );
  const lobeVidolInventory = collectJsAiInventory(
    "./test/data/ai-huggingface/github-apps/lobe-vidol",
    {},
  );

  // Models are identified by their group.
  const findByGroup = (inventory, group) =>
    inventory.components.find((candidate) => candidate.group === group);
  const localpilotModel = findByGroup(localpilotInventory, "TheBloke");
  const heavenBanBotModel = findByGroup(heavenBanBotInventory, "meta-llama");

  // localpilot: a GGUF artifact with a known quantization label.
  assert.ok(localpilotModel, "expected model from localpilot fixture");
  for (const [propName, expected] of [
    ["cdx:ai:artifactFormat", "gguf"],
    ["cdx:ai:quantization", "Q5_K_S"],
  ]) {
    assert.strictEqual(getProp(localpilotModel, propName), expected);
  }

  // heaven-ban-bot: a named meta-llama model.
  assert.ok(heavenBanBotModel, "expected model from heaven-ban-bot fixture");
  assert.strictEqual(heavenBanBotModel.name, "Llama-2-7b-chat-hf");

  // lobe-vidol: at least one Hugging Face service.
  const hasHuggingFaceService = lobeVidolInventory.services.some(
    (candidate) => candidate.group === "huggingface",
  );
  assert.ok(
    hasHuggingFaceService,
    "expected Hugging Face service from lobe-vidol fixture",
  );
});
// Runs the Hugging Face repo collector over the checked-in repos fixture and
// checks the model's pedigree, model card, and link to its dataset.
it("collects local Hugging Face repository metadata into pedigree and model cards", () => {
  const expectedDatasetPurl =
    "pkg:huggingface/HuggingFaceH4/ultrachat_200k?repository_url=https%3A%2F%2Fhuggingface.co%2Fdatasets";

  const inventory = collectHuggingFaceRepoAiInventory(
    "./test/data/ai-huggingface/repos",
    {},
  );
  const model = inventory.components.find(
    (candidate) =>
      candidate.type === "machine-learning-model" &&
      candidate.group === "HuggingFaceH4",
  );
  const dataset = inventory.components.find(
    (candidate) =>
      candidate.type === "data" &&
      candidate.group === "HuggingFaceH4" &&
      candidate.name === "ultrachat_200k",
  );

  // Model identity and ancestry.
  assert.ok(model, "expected local Hugging Face repo model");
  assert.strictEqual(model.name, "zephyr-7b-beta");
  assert.strictEqual(model.pedigree.ancestors[0].group, "mistralai");

  // Model card parameters.
  const modelParameters = model.modelCard.modelParameters;
  assert.strictEqual(modelParameters.task, "text-generation");
  assert.strictEqual(modelParameters.datasets[0].ref, expectedDatasetPurl);
  assert.strictEqual(modelParameters.inputs[0].format, "text");
  assert.strictEqual(modelParameters.outputs[0].format, "text");

  // Referenced dataset component.
  assert.ok(dataset, "expected referenced dataset component");
  assert.strictEqual(dataset.purl, expectedDatasetPurl);

  // Metrics, quantization, pedigree notes, and model-card properties.
  assert.strictEqual(
    model.modelCard.quantitativeAnalysis.performanceMetrics[0].type,
    "MT-Bench",
  );
  assert.strictEqual(getProp(model, "cdx:ai:quantization"), "bnb 4-bit");
  for (const notePattern of [/adapter/u, /quantized/u]) {
    assert.match(model.pedigree.notes, notePattern);
  }
  const declaresEnglish = model.modelCard.properties.some(
    (entry) =>
      entry.name === "cdx:huggingface:language" && entry.value === "en",
  );
  assert.ok(declaresEnglish);

  // The model must depend on the dataset.
  const modelDependsOnDataset = inventory.dependencies.some(
    (edge) =>
      edge.ref === model["bom-ref"] &&
      edge.dependsOn?.includes(dataset["bom-ref"]),
  );
  assert.ok(modelDependsOnDataset);
});
// A model card whose dataset URL carries a query string and fragment must
// produce a dataset component whose reference URL and purl omit them.
it("sanitizes local Hugging Face model-card dataset URLs before emitting BOM data", () => {
  const expectedDatasetPurl =
    "pkg:huggingface/team/dataset?repository_url=https%3A%2F%2Fhuggingface.co%2Fdatasets";

  const workDir = createTempDir();
  try {
    // Arrange: a repo directory with a README front matter and config.json.
    const repoDir = join(workDir, "team--model");
    mkdirSync(repoDir, { recursive: true });
    const readmeLines = [
      "---",
      "modelId: team/model",
      "library_name: transformers",
      "datasets:",
      " - name: team/dataset",
      " url: https://huggingface.co/datasets/team/dataset?download=1#fragment",
      "---",
      "",
      "# team/model",
    ];
    writeFileSync(join(repoDir, "README.md"), readmeLines.join("\n"));
    const repoConfig = {
      model_type: "llama",
      architectures: ["LlamaForCausalLM"],
    };
    writeFileSync(join(repoDir, "config.json"), JSON.stringify(repoConfig));

    // Act.
    const inventory = collectHuggingFaceRepoAiInventory(workDir, {});

    // Assert: model is present.
    const model = inventory.components.find(
      (candidate) => candidate.group === "team" && candidate.name === "model",
    );
    assert.ok(model, "expected sanitized local Hugging Face model");

    // Assert: dataset is present and carries only the cleaned URL.
    const dataset = inventory.components.find(
      (candidate) =>
        candidate.type === "data" &&
        candidate.group === "team" &&
        candidate.name === "dataset",
    );
    assert.ok(dataset, "expected referenced dataset component");
    assert.strictEqual(
      model.modelCard.modelParameters.datasets[0].ref,
      expectedDatasetPurl,
    );
    assert.strictEqual(
      dataset.externalReferences[0].url,
      "https://huggingface.co/datasets/team/dataset",
    );
    assert.strictEqual(dataset.purl, expectedDatasetPurl);
  } finally {
    rmSync(workDir, { force: true, recursive: true });
  }
});
// Writes a Python script, a notebook, and a prompt config into a scratch
// directory, then checks what each of the three collectors reports.
it("collects Python, notebook, and prompt-config AI signals with file relationships", () => {
  // Predicate factory: true for a component tagged with this cdx:file:kind.
  const hasFileKind = (kind) => (candidate) =>
    candidate.properties?.some(
      (entry) => entry.name === "cdx:file:kind" && entry.value === kind,
    );

  const workDir = createTempDir();
  try {
    // --- Arrange ---
    mkdirSync(join(workDir, "prompts"), { recursive: true });
    const pythonLines = [
      "from openai import OpenAI",
      "from langchain_openai import ChatOpenAI",
      "client = OpenAI()",
      'model_name = "gpt-4.1-mini"',
      'endpoint = "https://api.openai.com/v1/responses"',
    ];
    writeFileSync(join(workDir, "app.py"), pythonLines.join("\n"));
    const notebook = {
      cells: [
        {
          cell_type: "code",
          source: [
            "import anthropic\n",
            'model = "claude-3-7-sonnet"\n',
            'url = "https://api.anthropic.com/v1/messages"\n',
          ],
        },
      ],
    };
    writeFileSync(join(workDir, "analysis.ipynb"), JSON.stringify(notebook));
    const promptLines = [
      "provider: openai",
      "model: gpt-4o-mini",
      "endpoint: https://api.openai.com/v1/chat/completions",
    ];
    writeFileSync(
      join(workDir, "prompts", "system-prompt.yaml"),
      promptLines.join("\n"),
    );

    // --- Act ---
    const pythonInventory = collectPythonAiInventory(workDir, {});
    const notebookInventory = collectNotebookAiInventory(workDir, {});
    const promptInventory = collectPromptConfigAiInventory(workDir, {});

    // --- Assert: Python model and notebook file component ---
    assert.ok(
      pythonInventory.components.some(
        (candidate) => candidate.name === "gpt-4.1-mini",
      ),
    );
    assert.ok(notebookInventory.components.some(hasFileKind("notebook-file")));

    // --- Assert: prompt config file depends on its model ---
    const promptFile = promptInventory.components.find(
      hasFileKind("prompt-config-file"),
    );
    const promptModel = promptInventory.components.find(
      (candidate) => candidate.name === "gpt-4o-mini",
    );
    assert.ok(promptFile, "expected prompt config file component");
    assert.ok(promptModel, "expected prompt config model component");
    assert.ok(
      promptInventory.dependencies.some(
        (edge) =>
          edge.ref === promptFile["bom-ref"] &&
          edge.dependsOn?.includes(promptModel["bom-ref"]),
      ),
    );
  } finally {
    rmSync(workDir, { force: true, recursive: true });
  }
});
});