UNPKG

@cyclonedx/cdxgen

Version:

Creates CycloneDX Software Bill of Materials (SBOM) from source or container image

1,692 lines (1,629 loc) 64.7 kB
import { readFileSync, statSync } from "node:fs"; import { basename, dirname, extname, relative } from "node:path"; import { ggufFileTypeName, parseGgufFilename, readGgufMetadata, } from "../parsers/gguf.js"; import { createHuggingFaceDatasetReference, createHuggingFaceModelCard, createHuggingFacePedigree, HUGGING_FACE_ADAPTER_PATTERNS, HUGGING_FACE_CONFIG_PATTERNS, HUGGING_FACE_MODEL_CARD_PATTERNS, hasHuggingFaceCardSignals, parseHuggingFaceReadmeFrontmatter, repoIdFromFixtureDirectory, } from "../parsers/huggingfaceManifest.js"; import { parseOllamaModelfile } from "../parsers/ollama.js"; import { detectAiModelVariants, normalizeDetectedVariants, } from "./aiModelVariants.js"; import { normalizeHuggingFaceReference, quantizationValueFromConfig, repositoryUrlForHuggingFaceAssetType, toHuggingFaceAssetUrl, toHuggingFacePurl, } from "./huggingfaceUtils.js"; import { sanitizeBomPropertyValue, sanitizeBomUrl, sanitizeStructuredValueForBom, } from "./propertySanitizer.js"; import { getAllFiles, getLicenses } from "./utils.js"; // ----------------------------------------------------------------------------- // Section: source patterns, provider fingerprints, and low-level constants // ----------------------------------------------------------------------------- const JS_SOURCE_PATTERNS = ["**/*.{js,jsx,cjs,mjs,ts,tsx,mts,cts,vue,svelte}"]; const PYTHON_SOURCE_PATTERNS = ["**/*.{py,pyw}"]; const NOTEBOOK_SOURCE_PATTERNS = ["**/*.ipynb"]; const SHELL_WRAPPER_PATTERNS = ["**/*.{sh,bash,zsh,command}"]; const PROMPT_CONFIG_PATTERNS = [ "**/*.{prompt,prompt.txt,prompt.md}", "**/{prompt,prompts,agents,ai,assistant,instructions}/**/*.{md,txt,json,jsonc,yaml,yml,toml}", "**/*{prompt,prompts,instruction,instructions,assistant,system,persona,agent,model}*.{md,txt,json,jsonc,yaml,yml,toml}", ]; const MODEFILE_PATTERNS = ["Modelfile", "**/Modelfile", "**/Modelfile.*"]; const GGUF_PATTERNS = ["**/*.gguf"]; const GGUF_MIN_ALIGNMENT = 4; const GGUF_MAX_ALIGNMENT = 1024 * 1024; const IGNORE_SOURCE_FILE_PATTERN = /(^|\/|\\)(__tests__|fixtures?|examples?|samples?)($|\/|\\)|(^|\/|\\)(test|spec|mock|setup-jest|conftest|sitecustomize)\.(js|ts|tsx|py)$|(?<!vite\.|vue\.)(conf|config)\.(js|ts|tsx)$/iu; const HOST_PROVIDER_PATTERNS = [ ["openai", /(?:^|\.)openai\.com$/iu], ["anthropic", /(?:^|\.)anthropic\.com$/iu], ["google", /(?:^|\.)googleapis\.com$/iu], ["google", /(?:^|\.)generativelanguage\.googleapis\.com$/iu], ["azure-openai", /(?:^|\.)openai\.azure\.com$/iu], ["huggingface", /(?:^|\.)huggingface\.co$/iu], ["mistral", /(?:^|\.)mistral\.ai$/iu], ["cohere", /(?:^|\.)cohere\.ai$/iu], ["deepseek", /(?:^|\.)deepseek\.com$/iu], ["groq", /(?:^|\.)groq\.com$/iu], ["fireworks", /(?:^|\.)fireworks\.ai$/iu], ["together", /(?:^|\.)together\.xyz$/iu], ["replicate", /(?:^|\.)replicate\.com$/iu], ["perplexity", /(?:^|\.)perplexity\.ai$/iu], ["vertex-ai", /(?:^|\.)aiplatform\.googleapis\.com$/iu], ["ollama", /(?:^|\.)ollama\.com$/iu], ["ollama", /(?:^|\.)localhost$/iu], ["ollama", /(?:^|\.)127\.0\.0\.1$/iu], ]; const AI_PACKAGE_REGISTRY = [ { pattern: /^openai$/u, provider: "openai", serviceName: "OpenAI API" }, { pattern: /^@anthropic-ai\/sdk$/u, provider: "anthropic", serviceName: "Anthropic API", }, { pattern: /^@google\/genai$/u, provider: "google", serviceName: "Google Generative AI API", }, { pattern: /^@google-ai\/generativelanguage$/u, provider: "google", serviceName: "Google Generative AI API", }, { pattern: /^@azure\/openai$/u, provider: "azure-openai", serviceName: "Azure OpenAI API", }, { pattern: /^@huggingface\/inference$/u, provider: "huggingface", serviceName: "Hugging Face Inference API", }, { pattern: /^@huggingface\/transformers$/u, provider: "huggingface", runtime: "transformers.js", }, { pattern: /^langchain$/u, framework: "langchain" }, { pattern: /^@langchain\//u, framework: "langchain" }, { pattern: /^langgraph$/u, framework: "langgraph" }, { pattern: /^@langchain\/langgraph$/u, framework: "langgraph" }, { pattern: /^ai$/u, framework: "vercel-ai-sdk" }, { pattern: /^@ai-sdk\//u, framework: "vercel-ai-sdk" }, { pattern: /^@openai\/agents(?:-core)?$/u, framework: "openai-agents", provider: "openai", serviceName: "OpenAI API", }, { pattern: /^mastra$/u, framework: "mastra" }, { pattern: /^ollama$/u, provider: "ollama", runtime: "ollama" }, { pattern: /^node-llama-cpp$/u, runtime: "llama.cpp" }, { pattern: /^@mlc-ai\//u, runtime: "mlc" }, { pattern: /^groq-sdk$|^groq$/u, provider: "groq", serviceName: "Groq API" }, { pattern: /^cohere-ai$/u, provider: "cohere", serviceName: "Cohere API", }, { pattern: /^@mistralai\/mistralai$/u, provider: "mistral", serviceName: "Mistral API", }, { pattern: /^@deepseek\/openai$/u, provider: "deepseek", serviceName: "DeepSeek API", }, { pattern: /^google-generativeai$|^google-genai$/u, provider: "google", serviceName: "Google Generative AI API", }, { pattern: /^anthropic$/u, provider: "anthropic", serviceName: "Anthropic API", }, { pattern: /^transformers$|^sentence-transformers$/u, runtime: "transformers", }, { pattern: /^langchain_/u, framework: "langchain" }, { pattern: /^llama-index(?:-|$)/u, framework: "llama-index" }, { pattern: /^litellm$/u, framework: "litellm" }, { pattern: /^autogen(?:-agentchat)?$/u, framework: "autogen" }, { pattern: /^vllm$/u, runtime: "vllm" }, { pattern: /^together$|^together-ai$/u, provider: "together", serviceName: "Together API", }, { pattern: /^fireworks-ai$/u, provider: "fireworks", serviceName: "Fireworks AI API", }, { pattern: /^replicate$/u, provider: "replicate", serviceName: "Replicate API", }, { pattern: /^perplexity(?:ai)?$/u, provider: "perplexity", serviceName: "Perplexity API", }, ]; const serializeAiService = (service) => ({ "bom-ref": service["bom-ref"], group: service.group, name: service.name, provider: service.provider, version: service.version, endpoints: Array.from(service.endpoints) .map((endpoint) => sanitizeBomUrl(endpoint)) .filter(Boolean) .sort(), properties: service.properties, tags: Array.from(service.tags).sort(), evidence: service.occurrences.length ? { occurrences: service.occurrences } : undefined, }); const TEXT_AI_INVENTORY_CONFIGS = { javascript: { patterns: JS_SOURCE_PATTERNS }, notebook: { fileKind: "notebook-file", patterns: NOTEBOOK_SOURCE_PATTERNS, tags: ["notebook"], }, promptConfig: { fileKind: "prompt-config-file", patterns: [...SHELL_WRAPPER_PATTERNS, ...PROMPT_CONFIG_PATTERNS], tags: ["prompt-config"], }, python: { patterns: PYTHON_SOURCE_PATTERNS }, }; // ----------------------------------------------------------------------------- // Section: generic component, service, and text-scanning helpers // ----------------------------------------------------------------------------- const addUniqueProperty = (properties, name, value) => { const sanitizedValue = sanitizeBomPropertyValue(name, value); if ( sanitizedValue === undefined || sanitizedValue === null || sanitizedValue === "" ) { return; } const normalizedValue = String(sanitizedValue); if ( properties.some( (property) => property?.name === name && property?.value === normalizedValue, ) ) { return; } properties.push({ name, value: normalizedValue }); }; const linePrefixForIndex = (raw, index) => { const lineStart = raw.lastIndexOf("\n", index - 1) + 1; return raw.slice(lineStart, index).trim(); }; const extractImportNames = (raw) => { const imports = []; for (const match of raw.matchAll( /(?:import\s+(?:.+?\s+from\s+)?|require\(\s*)["'`]([^"'`]+)["'`]/gu, )) { if (linePrefixForIndex(raw, match.index) !== "") { continue; } imports.push({ index: match.index, names: [match[1]] }); } for (const match of raw.matchAll( /from\s+([A-Za-z0-9_.-]+)\s+import\b|import\s+([A-Za-z0-9_.,\s-]+)/gu, )) { if (linePrefixForIndex(raw, match.index) !== "") { continue; } const modules = match[1] ? [match[1]] : String(match[2] || "") .split(",") .map((entry) => entry.trim().split(/\s+as\s+/u)[0]) .filter(Boolean); if (modules.length) { imports.push({ index: match.index, names: modules }); } } return imports; }; const createFileSignals = () => ({ fileRef: undefined, frameworks: new Set(), modelRefs: new Set(), providers: new Set(), runtimes: new Set(), serviceRefs: new Set(), }); const relativeOccurrenceLocation = (discoveryPath, filePath, lineNumber) => { const relativePath = relative(discoveryPath, filePath) || basename(filePath); return lineNumber ? `${relativePath}#L${lineNumber}` : relativePath; }; const appendOccurrence = (target, location) => { target.occurrences = target.occurrences || []; if (!target.occurrences.some((entry) => entry.location === location)) { target.occurrences.push({ location }); } }; const mergePedigreeVariants = (pedigree, variants = []) => { const normalizedVariants = normalizeDetectedVariants(variants); if (!pedigree && !normalizedVariants.length) { return pedigree; } const nextPedigree = pedigree ? { ...pedigree } : {}; const notes = []; if (nextPedigree.notes) { notes.push(String(nextPedigree.notes)); } if (normalizedVariants.length) { notes.push(`Detected variants: ${normalizedVariants.join(", ")}`); } if (notes.length) { nextPedigree.notes = [...new Set(notes)].join("; "); } return Object.keys(nextPedigree).length ? nextPedigree : undefined; }; const lineNumberForIndex = (text, index) => text.slice(0, index).split("\n").length; const isLocalAiHostname = (hostname) => { const normalized = String(hostname || "").toLowerCase(); return ( normalized === "localhost" || normalized === "127.0.0.1" || normalized === "::1" || normalized.endsWith(".localhost") ); }; const hostProviderFromValue = (urlValue) => { try { const parsed = new URL(urlValue); return ( HOST_PROVIDER_PATTERNS.find(([, pattern]) => pattern.test(parsed.hostname), )?.[0] || undefined ); } catch { return undefined; } }; const modelFamilyFromName = (modelName) => { const normalized = String(modelName || "").toLowerCase(); if (!normalized) { return undefined; } if (normalized.includes("claude")) { return "claude"; } if (normalized.includes("gpt") || /^o[13](?:$|[-:])/u.test(normalized)) { return "gpt"; } if (normalized.includes("gemini")) { return "gemini"; } if (normalized.includes("llama")) { return "llama"; } if (normalized.includes("mistral")) { return "mistral"; } if (normalized.includes("command")) { return "command"; } if (normalized.includes("deepseek")) { return "deepseek"; } if (normalized.includes("qwen")) { return "qwen"; } return normalized.split(/[/:,\-]/u)[0] || undefined; }; const providerFromModelName = (modelName) => { const normalized = String(modelName || "").toLowerCase(); if (normalized.includes("claude")) { return "anthropic"; } if (normalized.includes("gpt") || /^o[13](?:$|[-:])/u.test(normalized)) { return "openai"; } if (normalized.includes("gemini")) { return "google"; } if (normalized.includes("mistral")) { return "mistral"; } if (normalized.includes("deepseek")) { return "deepseek"; } if (normalized.includes("llama")) { return "meta"; } return undefined; }; const parseJsonObject = (filePath) => { try { return JSON.parse(readFileSync(filePath, "utf-8")); } catch { return undefined; } }; const stableAiBomRef = (assetType, provider, identifier) => `cdxgen:ai:${assetType}:${provider}:${String(identifier || "").replaceAll(/[^a-zA-Z0-9._:-]+/gu, "-")}`; // ----------------------------------------------------------------------------- // Section: Hugging Face pedigree, dataset, and model-card helpers // ----------------------------------------------------------------------------- const providerEntityForName = (providerName) => providerName ? { name: String(providerName) } : undefined; const legacyQuantizationValueFromFilename = (fileName) => { const normalizedFileName = String(fileName || "").trim(); if (!normalizedFileName.toLowerCase().endsWith(".gguf")) { return undefined; } const segments = basename(normalizedFileName, ".gguf").split("."); return segments.length > 1 ? segments.at(-1) : undefined; }; const quantizationValueFromFilename = (fileName) => { const parsedFileName = parseGgufFilename(fileName); if (parsedFileName?.encoding) { return parsedFileName.encoding; } return legacyQuantizationValueFromFilename(fileName); }; const createExternalReference = (type, url, comment) => { const sanitizedUrl = sanitizeBomUrl(url); if (!sanitizedUrl) { return undefined; } const externalReference = { type, url: sanitizedUrl, }; if (comment) { externalReference.comment = comment; } return externalReference; }; const uniqueExternalReferences = (references) => [ ...new Map( references.filter(Boolean).map((ref) => [`${ref.type}:${ref.url}`, ref]), ).values(), ]; const createGgufExternalReferences = (metadata = {}) => uniqueExternalReferences([ createExternalReference("website", metadata["general.url"]), createExternalReference("vcs", metadata["general.repo_url"]), createExternalReference( "website", metadata["general.source.url"], "GGUF source metadata", ), createExternalReference( "vcs", metadata["general.source.repo_url"], "GGUF source repository", ), createExternalReference("license", metadata["general.license.link"]), createExternalReference( "citation", metadata["general.doi"] ? `https://doi.org/${metadata["general.doi"]}` : undefined, ), createExternalReference( "citation", metadata["general.source.doi"] ? `https://doi.org/${metadata["general.source.doi"]}` : undefined, "GGUF source DOI", ), ]); const createGgufBaseModelReference = (metadata, index) => { const baseKey = `general.base_model.${index}`; const repoUrl = metadata[`${baseKey}.repo_url`]; const name = metadata[`${baseKey}.name`]; const organization = metadata[`${baseKey}.organization`]; const version = metadata[`${baseKey}.version`]; const referenceUrl = metadata[`${baseKey}.url`]; const huggingFaceReference = normalizeHuggingFaceReference(repoUrl); const externalReferences = uniqueExternalReferences([ createExternalReference("website", referenceUrl), createExternalReference("vcs", repoUrl), createExternalReference( "citation", metadata[`${baseKey}.doi`] ? `https://doi.org/${metadata[`${baseKey}.doi`]}` : undefined, ), ]); if (huggingFaceReference?.assetType === "model") { return { "bom-ref": toHuggingFacePurl(huggingFaceReference.repoId), type: "machine-learning-model", group: huggingFaceReference.repoId.split("/")[0], name: huggingFaceReference.repoId.split("/")[1], purl: toHuggingFacePurl(huggingFaceReference.repoId), version, externalReferences, }; } if (!name && !repoUrl) { return undefined; } const referenceName = name || repoUrl || `gguf-base-model-${index}`; return { "bom-ref": stableAiBomRef( "model", organization || "gguf-base-model", referenceName, ), type: "machine-learning-model", group: organization, name: referenceName, version, externalReferences, }; }; const createGgufPedigree = (metadata = {}) => { const baseModelIndexes = new Set(); const baseModelCount = Number(metadata["general.base_model.count"]); if (Number.isInteger(baseModelCount) && baseModelCount > 0) { for (let index = 0; index < baseModelCount; index++) { baseModelIndexes.add(index); } } for (const key of Object.keys(metadata)) { const match = /^general\.base_model\.(\d+)\./u.exec(key); if (match) { baseModelIndexes.add(Number.parseInt(match[1], 10)); } } const ancestors = Array.from(baseModelIndexes) .sort((left, right) => left - right) .map((index) => createGgufBaseModelReference(metadata, index)) .filter(Boolean); if (!ancestors.length) { return undefined; } return { ancestors }; }; const ggufStringArray = (value) => Array.isArray(value) ? value.map((entry) => String(entry || "").trim()).filter(Boolean) : []; const normalizeGgufAlignment = (value) => { const numericValue = typeof value === "string" ? (() => { const normalizedValue = value.trim(); if (!/^\d+$/.test(normalizedValue)) { return Number.NaN; } return Number(normalizedValue); })() : Number(value); if (!Number.isInteger(numericValue)) { return undefined; } if (numericValue < GGUF_MIN_ALIGNMENT || numericValue > GGUF_MAX_ALIGNMENT) { return undefined; } return numericValue; }; const ggufTokenizerSignals = (metadata = {}) => ({ addedTokenCount: ggufStringArray(metadata["tokenizer.ggml.added_tokens"]) .length, bosTokenId: metadata["tokenizer.ggml.bos_token_id"], chatTemplateDetected: typeof metadata["tokenizer.chat_template"] === "string" && metadata["tokenizer.chat_template"].trim().length > 0, chatTemplateLength: typeof metadata["tokenizer.chat_template"] === "string" ? metadata["tokenizer.chat_template"].length : undefined, eosTokenId: metadata["tokenizer.ggml.eos_token_id"], huggingFaceTokenizer: typeof metadata["tokenizer.huggingface.json"] === "string" && metadata["tokenizer.huggingface.json"].trim().length > 0, mergeCount: ggufStringArray(metadata["tokenizer.ggml.merges"]).length, paddingTokenId: metadata["tokenizer.ggml.padding_token_id"], scoreCount: Array.isArray(metadata["tokenizer.ggml.scores"]) ? metadata["tokenizer.ggml.scores"].length : undefined, separatorTokenId: metadata["tokenizer.ggml.separator_token_id"], tokenCount: ggufStringArray(metadata["tokenizer.ggml.tokens"]).length, tokenizerModel: metadata["tokenizer.ggml.model"], tokenTypeCount: Array.isArray(metadata["tokenizer.ggml.token_type"]) ? metadata["tokenizer.ggml.token_type"].length : undefined, unknownTokenId: metadata["tokenizer.ggml.unknown_token_id"], }); const inferGgufModelTask = (metadata, parsedFileName) => { const tags = ggufStringArray(metadata["general.tags"]).map((tag) => tag.toLowerCase(), ); const fineTune = String(metadata["general.finetune"] || "").toLowerCase(); if ( tags.includes("text-generation") || fineTune.includes("chat") || fineTune.includes("instruct") || fineTune.includes("coding") || fineTune.includes("code") || ggufTokenizerSignals(metadata).chatTemplateDetected ) { return "text-generation"; } if ( tags.includes("embedding") || tags.includes("embeddings") || String(parsedFileName?.baseName || "") .toLowerCase() .includes("embed") ) { return "feature-extraction"; } return undefined; }; const createInlineDatasetReference = (datasetValue) => { const normalizedDatasetValue = String(datasetValue || "").trim(); if (!normalizedDatasetValue) { return undefined; } const sanitizedUrl = sanitizeBomUrl(normalizedDatasetValue); if (sanitizedUrl) { const huggingFaceReference = normalizeHuggingFaceReference(sanitizedUrl); return { type: "dataset", name: huggingFaceReference?.repoId || sanitizedUrl, contents: { url: sanitizedUrl, }, }; } return { type: "dataset", name: normalizedDatasetValue, }; }; const dedupeModelCardDatasets = (datasets) => [ ...new Map( datasets .filter(Boolean) .map((dataset) => [dataset.contents?.url || dataset.name, dataset]), ).values(), ]; const createGgufModelCard = (metadata, parsedFileName) => { const architectureFamily = metadata["general.architecture"]; const modelArchitecture = metadata["general.basename"]; const datasets = dedupeModelCardDatasets( ggufStringArray(metadata["general.datasets"]).map( createInlineDatasetReference, ), ); const task = inferGgufModelTask(metadata, parsedFileName); const tokenizerSignals = ggufTokenizerSignals(metadata); const modelCard = { modelParameters: {}, }; if (architectureFamily) { modelCard.modelParameters.architectureFamily = architectureFamily; } if (modelArchitecture) { modelCard.modelParameters.modelArchitecture = modelArchitecture; } if (task) { modelCard.modelParameters.task = task; } if (datasets.length) { modelCard.modelParameters.datasets = datasets; } if ( tokenizerSignals.tokenizerModel || tokenizerSignals.chatTemplateDetected ) { modelCard.modelParameters.inputs = [{ format: "text" }]; modelCard.modelParameters.outputs = [{ format: "text" }]; } return Object.values(modelCard.modelParameters).some(Boolean) ? sanitizeStructuredValueForBom(modelCard) : undefined; }; const contextWindowFromGgufMetadata = (metadata = {}) => { const architecture = String(metadata["general.architecture"] || "").trim(); if ( architecture && metadata[`${architecture}.context_length`] !== undefined ) { return metadata[`${architecture}.context_length`]; } if (metadata["general.context_length"] !== undefined) { return metadata["general.context_length"]; } for (const [key, value] of Object.entries(metadata)) { if (key.endsWith(".context_length")) { return value; } } return undefined; }; const ggufModelIdFromFilename = (parsedFileName, filePath) => { const segments = []; if (parsedFileName?.sidecar) { segments.push(parsedFileName.sidecar); } if (parsedFileName?.baseName) { segments.push(parsedFileName.baseName); } if (parsedFileName?.sizeLabel) { segments.push(parsedFileName.sizeLabel); } if (parsedFileName?.fineTune) { segments.push(parsedFileName.fineTune); } if (parsedFileName?.type) { segments.push(parsedFileName.type); } if (segments.length) { return segments.join("-"); } return basename(filePath, extname(filePath)); }; const applyGgufProperties = ( subject, metadata, parsedFileName, includeArtifactDetails, ) => { const tokenizerSignals = ggufTokenizerSignals(metadata); addUniqueProperty( subject.properties, "cdx:gguf:basename", metadata["general.basename"] || parsedFileName?.baseName, ); addUniqueProperty( subject.properties, "cdx:gguf:sizeLabel", metadata["general.size_label"] || parsedFileName?.sizeLabel, ); addUniqueProperty( subject.properties, "cdx:gguf:finetune", metadata["general.finetune"] || parsedFileName?.fineTune, ); addUniqueProperty( subject.properties, "cdx:gguf:quantizationVersion", metadata["general.quantization_version"], ); addUniqueProperty( subject.properties, "cdx:gguf:quantizedBy", metadata["general.quantized_by"], ); addUniqueProperty( subject.properties, "cdx:gguf:sidecar", parsedFileName?.sidecar, ); addUniqueProperty(subject.properties, "cdx:gguf:type", parsedFileName?.type); addUniqueProperty( subject.properties, "cdx:gguf:tokenizerModel", tokenizerSignals.tokenizerModel, ); addUniqueProperty( subject.properties, "cdx:gguf:tokenizerTokenCount", tokenizerSignals.tokenCount, ); addUniqueProperty( subject.properties, "cdx:gguf:tokenizerScoreCount", tokenizerSignals.scoreCount, ); addUniqueProperty( subject.properties, "cdx:gguf:tokenizerTokenTypeCount", tokenizerSignals.tokenTypeCount, ); addUniqueProperty( subject.properties, "cdx:gguf:tokenizerMergeCount", tokenizerSignals.mergeCount, ); addUniqueProperty( subject.properties, "cdx:gguf:tokenizerAddedTokenCount", tokenizerSignals.addedTokenCount, ); addUniqueProperty( subject.properties, "cdx:gguf:huggingFaceTokenizer", tokenizerSignals.huggingFaceTokenizer ? "true" : undefined, ); addUniqueProperty( subject.properties, "cdx:gguf:chatTemplateDetected", tokenizerSignals.chatTemplateDetected ? "true" : undefined, ); addUniqueProperty( subject.properties, "cdx:gguf:chatTemplateLength", tokenizerSignals.chatTemplateLength, ); addUniqueProperty( subject.properties, "cdx:gguf:bosTokenId", tokenizerSignals.bosTokenId, ); addUniqueProperty( subject.properties, "cdx:gguf:eosTokenId", tokenizerSignals.eosTokenId, ); addUniqueProperty( subject.properties, "cdx:gguf:unknownTokenId", tokenizerSignals.unknownTokenId, ); addUniqueProperty( subject.properties, "cdx:gguf:separatorTokenId", tokenizerSignals.separatorTokenId, ); addUniqueProperty( subject.properties, "cdx:gguf:paddingTokenId", tokenizerSignals.paddingTokenId, ); for (const language of ggufStringArray(metadata["general.languages"])) { addUniqueProperty(subject.properties, "cdx:gguf:language", language); } addUniqueProperty( subject.properties, "cdx:gguf:datasetCount", ggufStringArray(metadata["general.datasets"]).length || undefined, ); if (!includeArtifactDetails) { return; } addUniqueProperty( subject.properties, "cdx:gguf:formatVersion", metadata["gguf.version"], ); addUniqueProperty( subject.properties, "cdx:gguf:tensorCount", metadata["gguf.tensorCount"], ); addUniqueProperty( subject.properties, "cdx:gguf:metadataCount", metadata["gguf.metadataCount"], ); addUniqueProperty( subject.properties, "cdx:gguf:alignment", normalizeGgufAlignment(metadata["general.alignment"]), ); addUniqueProperty( subject.properties, "cdx:gguf:shard", parsedFileName?.shard, ); addUniqueProperty( subject.properties, "cdx:gguf:shardIndex", parsedFileName?.shardIndex, ); addUniqueProperty( subject.properties, "cdx:gguf:shardCount", parsedFileName?.shardCount, ); }; const extractHuggingFaceArtifactDetails = (sourceUrl) => { if (!sourceUrl) { return {}; } try { const parsed = new URL(sourceUrl); const fileName = basename(parsed.pathname); const artifactFormat = fileName.toLowerCase().endsWith(".gguf") ? "gguf" : undefined; const parsedGgufFileName = parseGgufFilename(fileName); return { artifactFormat, quantization: parsedGgufFileName?.encoding || quantizationValueFromFilename(fileName), sourceUrl, }; } catch { return {}; } }; const dependencyListFromMap = (dependencyMap) => Array.from(dependencyMap.entries()).map(([ref, dependsOn]) => ({ ref, dependsOn: Array.from(dependsOn).sort(), })); const buildAiInventoryResult = ( componentsByKey, servicesByKey, dependencyMap, ) => { const components = Array.from(componentsByKey.values()).map((component) => { applyOccurrenceEvidence(component, component?.evidence?.occurrences || []); return component; }); const services = Array.from(servicesByKey.values()).map((service) => { syncServiceProperties(service); return serializeAiService(service); }); const dependencies = dependencyListFromMap(dependencyMap); for (const service of services) { const source = servicesByKey.get(`${service.group}:${service.name}`); const dependsOn = new Set([ ...Array.from(source?.modelRefs || []), ...Array.from(source?.fileRefs || []), ]); if (dependsOn.size) { dependencies.push({ ref: service["bom-ref"], dependsOn: Array.from(dependsOn).sort(), }); } } return { components, dependencies, services }; }; const classifyImport = (importName) => AI_PACKAGE_REGISTRY.find((entry) => entry.pattern.test(importName)); const getSourceFiles = (discoveryPath, patterns, options) => { const files = new Set(); const directFileMatches = getDirectDiscoveryFileMatches( discoveryPath, patterns, ); for (const filePath of directFileMatches) { const normalizedFilePath = String(filePath || ""); if ( normalizedFilePath.includes("/node_modules/") || normalizedFilePath.includes("\\node_modules\\") || IGNORE_SOURCE_FILE_PATTERN.test(normalizedFilePath) ) { continue; } files.add(filePath); } if (isDirectDiscoveryFile(discoveryPath)) { return Array.from(files).sort(); } for (const pattern of patterns) { for (const filePath of getAllFiles(discoveryPath, pattern, options) || []) { const normalizedFilePath = String(filePath || ""); if ( normalizedFilePath.includes("/node_modules/") || normalizedFilePath.includes("\\node_modules\\") || IGNORE_SOURCE_FILE_PATTERN.test(normalizedFilePath) ) { continue; } files.add(filePath); } } return Array.from(files).sort(); }; const getMatchingFiles = (discoveryPath, patterns, options) => { const files = new Set(); for (const filePath of getDirectDiscoveryFileMatches( discoveryPath, patterns, )) { files.add(filePath); } if (isDirectDiscoveryFile(discoveryPath)) { return Array.from(files).sort(); } for (const pattern of patterns) { for (const filePath of getAllFiles(discoveryPath, pattern, options) || []) { files.add(filePath); } } return Array.from(files).sort(); }; const isDirectDiscoveryFile = (discoveryPath) => { try { return statSync(discoveryPath).isFile(); } catch { return false; } }; const getDirectDiscoveryFileMatches = (discoveryPath, patterns) => { if (!isDirectDiscoveryFile(discoveryPath)) { return []; } const normalizedPath = String(discoveryPath || ""); const filename = basename(normalizedPath); const extension = extname(normalizedPath).toLowerCase(); if ( patterns.includes("Modelfile") && (filename === "Modelfile" || filename.startsWith("Modelfile.")) ) { return [normalizedPath]; } if (patterns.includes("**/*.gguf") && extension === ".gguf") { return [normalizedPath]; } if ( patterns.some((pattern) => pattern.includes("*.{js,jsx,cjs,mjs,ts,tsx,mts,cts,vue,svelte}"), ) && [ ".js", ".jsx", ".cjs", ".mjs", ".ts", ".tsx", ".mts", ".cts", ".vue", ".svelte", ].includes(extension) ) { return [normalizedPath]; } if ( patterns.some((pattern) => pattern.includes("*.{py,pyw}")) && [".py", ".pyw"].includes(extension) ) { return [normalizedPath]; } if (patterns.includes("**/*.ipynb") && extension === ".ipynb") { return [normalizedPath]; } return []; }; const defaultServiceNameForProvider = (provider) => AI_PACKAGE_REGISTRY.find( (entry) => entry.provider === provider && entry.serviceName, )?.serviceName || `${provider || "ai"}-service`; const fileBomRef = (discoveryPath, filePath) => `urn:file:ai:${(relative(discoveryPath, filePath) || basename(filePath)).replaceAll(/[^a-zA-Z0-9._:/-]+/gu, "-")}`; const createAiFileComponent = ( componentsByKey, discoveryPath, filePath, kind, tags = [], ) => { const key = `file:${relative(discoveryPath, filePath) || basename(filePath)}`; if (!componentsByKey.has(key)) { const location = relativeOccurrenceLocation(discoveryPath, filePath); componentsByKey.set(key, { "bom-ref": fileBomRef(discoveryPath, filePath), type: "file", name: basename(filePath), evidence: { occurrences: [{ location }] }, properties: [ { name: "cdx:file:kind", value: kind }, { name: "cdx:ai:kind", value: kind }, { name: "cdx:ai:source", value: "source-code-analysis" }, ], tags: ["ai", kind, ...tags], }); } return componentsByKey.get(key); }; const normalizeComponentIdentity = ({ assetType, modelId, provider, version, }) => { const normalizedAssetType = ["dataset", "space"].includes(assetType) ? assetType : "model"; const normalizedModelId = String(modelId || "").trim(); const normalizedProvider = String(provider || "ai").trim(); const typedModelId = normalizedAssetType === "dataset" ? `datasets/${normalizedModelId.replace(/^datasets\//u, "")}` : normalizedAssetType === "space" ? `spaces/${normalizedModelId.replace(/^spaces\//u, "")}` : normalizedModelId; const repoId = normalizeHuggingFaceReference(typedModelId); if ( repoId?.assetType === normalizedAssetType && repoId?.repoId?.includes("/") ) { const [group, name] = repoId.repoId.split("/"); const purl = toHuggingFacePurl( repoId.repoId, repoId.version || version, repositoryUrlForHuggingFaceAssetType(normalizedAssetType), ); return { bomRef: purl, group, name, purl, }; } return { bomRef: stableAiBomRef( normalizedAssetType, normalizedProvider, normalizedModelId, ), group: normalizedProvider, name: normalizedModelId, }; }; const ensureModelComponent = (componentsByKey, key, seed) => { if (!componentsByKey.has(key)) { const identity = normalizeComponentIdentity(seed); const type = seed.assetType === "dataset" ? "data" : seed.assetType === "space" ? "application" : "machine-learning-model"; componentsByKey.set(key, { "bom-ref": identity.bomRef, type, group: identity.group, name: identity.name, purl: identity.purl, description: seed.description, version: seed.version, licenses: seed.licenses, externalReferences: seed.externalReferences || [], evidence: { occurrences: [] }, modelCard: seed.modelCard ? sanitizeStructuredValueForBom(seed.modelCard) : undefined, pedigree: seed.pedigree, properties: [], tags: ["ai"], }); } return componentsByKey.get(key); }; const ensureService = (servicesByKey, provider, serviceName) => { const normalizedProvider = provider || "ai"; const resolvedServiceName = serviceName || defaultServiceNameForProvider(normalizedProvider); const key = `${normalizedProvider}:${resolvedServiceName}`; if (!servicesByKey.has(key)) { servicesByKey.set(key, { "bom-ref": `urn:service:ai:${normalizedProvider}:${String( serviceName || `${normalizedProvider}-service`, ).replaceAll(/[^a-zA-Z0-9._:-]+/gu, "-")}`, group: normalizedProvider, name: resolvedServiceName, provider: providerEntityForName(normalizedProvider), version: "observed", endpoints: new Set(), modelRefs: new Set(), modelIds: new Set(), modelFamilies: new Set(), frameworks: new Set(), runtimes: new Set(), sdkImports: new Set(), fileRefs: new Set(), occurrences: [], properties: [], tags: new Set(["ai"]), }); } return servicesByKey.get(key); }; const syncComponentProperties = (component, data = {}) => { if (data.provider) { addUniqueProperty(component.properties, "cdx:ai:provider", data.provider); } if (data.kind) { addUniqueProperty(component.properties, "cdx:ai:kind", data.kind); } if (data.modelFamily) { addUniqueProperty( component.properties, "cdx:ai:modelFamily", data.modelFamily, ); } if (data.artifactFormat) { addUniqueProperty( component.properties, "cdx:ai:artifactFormat", data.artifactFormat, ); } if (data.runtime) { addUniqueProperty(component.properties, "cdx:ai:runtime", data.runtime); } if (data.quantization) { addUniqueProperty( component.properties, "cdx:ai:quantization", data.quantization, ); } for (const variant of data.variants || []) { addUniqueProperty(component.properties, "cdx:ai:variant", variant); } if (data.parameterCount !== undefined) { addUniqueProperty( component.properties, "cdx:ai:parameterCount", data.parameterCount, ); } if (data.contextWindow !== undefined) { addUniqueProperty( component.properties, "cdx:ai:contextWindow", data.contextWindow, ); } if (data.modality) { addUniqueProperty(component.properties, "cdx:ai:modality", data.modality); } if (data.source) { addUniqueProperty(component.properties, "cdx:ai:source", data.source); } if (data.confidence) { addUniqueProperty( component.properties, "cdx:ai:confidence", data.confidence, ); } if (data.reviewNeeded) { addUniqueProperty(component.properties, "cdx:ai:reviewNeeded", "true"); } }; const aiModelVariantsFromSeed = (seed = {}) => detectAiModelVariants({ description: seed.description, metadata: [ seed.artifactFormat, seed.modelFamily, seed.runtime, seed.source, seed.modelCard?.modelParameters?.task, ...(seed.modelCard?.modelParameters?.datasets || []).map( (dataset) => dataset?.name || dataset?.ref, ), ], modelName: [seed.modelId, seed.name], notes: [seed.pedigree?.notes], quantization: seed.quantization, relation: String(seed.pedigree?.notes || "") .match(/Hugging Face relation:\s*([^;]+)/u)?.[1] ?.trim() || seed.relation, tags: seed.tags, }); const syncServiceProperties = (service) => { service.properties = []; addUniqueProperty(service.properties, "cdx:ai:kind", "inference-service"); addUniqueProperty( service.properties, "cdx:ai:source", "source-code-analysis", ); const sortedFamilies = Array.from(service.modelFamilies).sort(); const sortedModelIds = Array.from(service.modelIds).sort(); const endpoints = Array.from(service.endpoints); const remoteEndpoints = []; const localEndpoints = []; for (const endpoint of endpoints) { try { const parsed = new URL(endpoint); if (isLocalAiHostname(parsed.hostname)) { localEndpoints.push(endpoint); } else { remoteEndpoints.push(endpoint); } } catch { // Ignore malformed endpoints captured from source strings. } } for (const modelId of sortedModelIds) { addUniqueProperty(service.properties, "cdx:ai:modelId", modelId); } for (const modelFamily of sortedFamilies) { addUniqueProperty(service.properties, "cdx:ai:modelFamily", modelFamily); } addUniqueProperty( service.properties, "cdx:ai:modelCount", String(sortedModelIds.length), ); addUniqueProperty( service.properties, "cdx:ai:modelSelection", sortedModelIds.length ? "explicit" : "implicit", ); addUniqueProperty( service.properties, "cdx:ai:deployment", remoteEndpoints.length ? "remote" : localEndpoints.length ? "local" : "implicit", ); addUniqueProperty( service.properties, "cdx:ai:transportSecurity", remoteEndpoints.some((endpoint) => endpoint.startsWith("http://")) ? "insecure-http" : remoteEndpoints.length ? "https" : localEndpoints.length ? "local-only" : "unknown", ); if (service.runtimes.size) { addUniqueProperty( service.properties, "cdx:ai:runtime", Array.from(service.runtimes).sort().join(","), ); } addUniqueProperty( service.properties, "cdx:ai:confidence", sortedModelIds.length || service.endpoints.size ? "high" : "medium", ); if ( service.group === "ollama" || Array.from(service.endpoints).some((endpoint) => endpoint.includes("localhost"), ) ) { addUniqueProperty(service.properties, "cdx:ai:reviewNeeded", "true"); } }; const appendImportSignals = ( importName, occurrence, fileSignals, servicesByKey, ) => { const classification = classifyImport(importName); if (!classification) { return; } if (classification.framework) { fileSignals.frameworks.add(classification.framework); } if (classification.provider) { fileSignals.providers.add(classification.provider); const service = ensureService( servicesByKey, classification.provider, classification.serviceName, ); service.sdkImports.add(importName); service.tags.add(classification.provider); appendOccurrence( service, occurrence.fileName ? `${occurrence.fileName}${occurrence.lineNumber ? `#L${occurrence.lineNumber}` : ""}` : importName, ); fileSignals.serviceRefs.add(service["bom-ref"]); if (fileSignals.fileRef) { service.fileRefs.add(fileSignals.fileRef); } } if (classification.runtime) { fileSignals.runtimes.add(classification.runtime); } }; const scanUrlMatches = ( raw, filePath, discoveryPath, fileSignals, servicesByKey, ) => { for (const match of raw.matchAll(/https?:\/\/[^\s"'`)<]+/gu)) { const urlValue = sanitizeBomUrl(match[0]); if (!urlValue) { continue; } const provider = hostProviderFromValue(urlValue); if (!provider) { continue; } fileSignals.providers.add(provider); const service = ensureService(servicesByKey, provider, undefined); service.endpoints.add(urlValue); if (fileSignals.fileRef) { service.fileRefs.add(fileSignals.fileRef); } fileSignals.serviceRefs.add(service["bom-ref"]); appendOccurrence( service, relativeOccurrenceLocation( discoveryPath, filePath, lineNumberForIndex(raw, match.index), ), ); } }; const extractModelAssignments = (raw) => { const values = []; const patterns = [ /\bmodel(?:Id|Name)?\s*[:=]\s*["'`]([^"'`\n]{2,160})["'`]/gu, /\b(?:model|model_name|model_id)\s*=\s*["'`]([^"'`\n]{2,160})["'`]/gu, /\b(?:model|model_name|model_id)\s*:\s*([A-Za-z0-9._:/-]{2,160})\b/gu, /pipeline\s*\(\s*["'`][^"'`]+["'`]\s*,\s*["'`]([^"'`\n]{2,160})["'`]/gu, /InferenceClient\s*\(\s*["'`]([^"'`\n]{2,160})["'`]/gu, /\b(?:from_pretrained|AutoModel(?:For\w+)?)\s*\(\s*["'`]([^"'`\n]{2,160})["'`]/gu, ]; for (const pattern of patterns) { for (const match of raw.matchAll(pattern)) { values.push({ index: match.index, value: match[1] }); } } return values.filter((entry) => { const normalized = String(entry.value || "").trim(); return ( normalized && !normalized.startsWith("http") && normalized !== "auto" ); }); }; const extractHuggingFaceReferences = (raw) => { const refs = []; const patterns = [ /https?:\/\/huggingface\.co\/(datasets\/|spaces\/)?([^/"'`?#\s]+\/[^/"'`?#\s]+)(?:\/resolve\/[^"'`?#\s]+\/([^"'`?#\s]+))?/gu, /\b(?:repo_?id|model)\s*[:=]\s*["'`]([^"'`\n]+\/[^"'`\n]+)["'`]/gu, ]; for (const pattern of patterns) { for (const match of raw.matchAll(pattern)) { refs.push({ index: match.index, artifact: match[3], sourceUrl: match[0]?.startsWith("http") ? match[0] : undefined, value: match[2] !== undefined ? `${match[1] || ""}${match[2]}` : match[1], }); } } return refs .map((entry) => { const reference = normalizeHuggingFaceReference(entry.value); return { artifact: entry.artifact, index: entry.index, reference, sourceUrl: entry.sourceUrl, }; }) .filter((entry) => entry.reference?.repoId); }; const applyOccurrenceEvidence = (subject, occurrences) => { if (!occurrences?.length) { return; } subject.evidence = subject.evidence || {}; subject.evidence.occurrences = subject.evidence.occurrences || []; for (const occurrence of occurrences) { if ( !subject.evidence.occurrences.some( (entry) => entry.location === occurrence.location, ) ) { subject.evidence.occurrences.push(occurrence); } } }; const createModelComponent = (componentsByKey, seed, occurrenceLocation) => { const variants = normalizeDetectedVariants([ ...(seed.variants || []), ...aiModelVariantsFromSeed(seed), ]); const normalizedAssetType = ["dataset", "space"].includes(seed.assetType) ? seed.assetType : "model"; const key = `${normalizedAssetType}:${seed.provider || "ai"}:${seed.modelId}`; const component = ensureModelComponent(componentsByKey, key, seed); syncComponentProperties(component, { artifactFormat: seed.artifactFormat, confidence: seed.confidence, contextWindow: seed.contextWindow, kind: normalizedAssetType, modality: seed.modality, modelFamily: seed.modelFamily, parameterCount: seed.parameterCount, provider: seed.provider, quantization: seed.quantization, reviewNeeded: seed.reviewNeeded, runtime: seed.runtime, source: seed.source, }); if (occurrenceLocation) { appendOccurrence(component.evidence, occurrenceLocation); } if (seed.tags?.length) { component.tags = [ ...new Set([...(component.tags || []), ...seed.tags, ...variants]), ]; } if (seed.externalReferences?.length) { component.externalReferences = [ ...new Map( [ ...(component.externalReferences || []), ...seed.externalReferences, ].map((reference) => [`${reference.type}:${reference.url}`, reference]), ).values(), ]; } if (seed.licenses?.length && !component.licenses?.length) { component.licenses = seed.licenses; } if (!component.modelCard && seed.modelCard) { component.modelCard = sanitizeStructuredValueForBom(seed.modelCard); } const mergedPedigree = mergePedigreeVariants( component.pedigree || seed.pedigree, variants, ); if (mergedPedigree) { component.pedigree = mergedPedigree; } if (!component.description && seed.description) { component.description = seed.description; } if (!component.version && seed.version) { component.version = seed.version; } return component; }; // ----------------------------------------------------------------------------- // Section: artifact collectors (Modelfile and GGUF) // ----------------------------------------------------------------------------- const attachModelToServices = (servicesByKey, modelComponent, signals) => { const providers = signals.providers.size ? Array.from(signals.providers) : [modelComponent.group].filter(Boolean); for (const provider of providers) { const service = ensureService(servicesByKey, provider, undefined); service.modelRefs.add(modelComponent["bom-ref"]); service.modelIds.add(modelComponent.name); const family = modelFamilyFromName(modelComponent.name); if (family) { service.modelFamilies.add(family); } for (const framework of signals.frameworks) { service.frameworks.add(framework); service.tags.add(framework); } for (const runtime of signals.runtimes) { service.runtimes.add(runtime); } if (signals.fileRef) { service.fileRefs.add(signals.fileRef); } signals.serviceRefs.add(service["bom-ref"]); } }; const collectTextAiInventory = (discoveryPath, options = {}, config = {}) => { const componentsByKey = new Map(); const servicesByKey = new Map(); const dependencyMap = new Map(); const sourceFiles = getSourceFiles( discoveryPath, config.patterns || [], options, ); for (const filePath of sourceFiles) { let raw = ""; try { raw = readFileSync(filePath, "utf-8"); } catch { continue; } const fileKey = relative(discoveryPath, filePath) || basename(filePath); const fileSignals = createFileSignals(); if (config.fileKind) { fileSignals.fileRef = createAiFileComponent( componentsByKey, discoveryPath, filePath, config.fileKind, config.tags, )["bom-ref"]; } scanUrlMatches(raw, filePath, discoveryPath, fileSignals, servicesByKey); for (const match of extractImportNames(raw)) { for (const importName of match.names) { appendImportSignals( importName, { fileName: fileKey, lineNumber: lineNumberForIndex(raw, match.index), }, fileSignals, servicesByKey, ); } } for (const match of extractModelAssignments(raw)) { const modelId = String(match.value || "").trim(); const provider = Array.from(fileSignals.providers)[0] || providerFromModelName(modelId); const component = createModelComponent( componentsByKey, { assetType: "model", confidence: "medium", kind: "model", modelFamily: modelFamilyFromName(modelId), modelId, provider, reviewNeeded: provider === "ollama", runtime: Array.from(fileSignals.runtimes)[0], source: "source-code-analysis", tags: [...fileSignals.frameworks], }, relativeOccurrenceLocation( discoveryPath, filePath, lineNumberForIndex(raw, match.index), ), ); fileSignals.modelRefs.add(component["bom-ref"]); attachModelToServices(servicesByKey, component, fileSignals); } for (const match of extractHuggingFaceReferences(raw)) { const occurrenceLocation = relativeOccurrenceLocation( discoveryPath, filePath, lineNumberForIndex(raw, match.index), ); const assetType = match.reference.assetType; const [group, name] = match.reference.repoId.split("/"); const artifactDetails = extractHuggingFac