@cyclonedx/cdxgen
Version:
Creates CycloneDX Software Bill of Materials (SBOM) from source or container image
467 lines (449 loc) • 13.9 kB
JavaScript
import { PackageURL } from "packageurl-js";
import {
sanitizeBomPropertyValue,
sanitizeBomUrl,
} from "./propertySanitizer.js";
const normalizeArray = (value) => {
if (Array.isArray(value)) {
return value;
}
if (value === undefined || value === null) {
return [];
}
return [value];
};
export const HF_BASE_URL = "https://huggingface.co";
export const HUGGING_FACE_ANCESTOR_RELATIONS = new Set([
"adapter",
"distilled",
"distillation",
"finetune",
"fine-tune",
"fine_tune",
"merge",
"merged",
"quantized",
]);
export const HUGGING_FACE_DATASET_REPOSITORY_URL = `${HF_BASE_URL}/datasets`;
export const HUGGING_FACE_SPACE_REPOSITORY_URL = `${HF_BASE_URL}/spaces`;
export function repositoryUrlForHuggingFaceAssetType(assetType) {
if (assetType === "dataset") {
return HUGGING_FACE_DATASET_REPOSITORY_URL;
}
if (assetType === "space") {
return HUGGING_FACE_SPACE_REPOSITORY_URL;
}
return HF_BASE_URL;
}
export function assetTypeFromHuggingFaceRepositoryUrl(repositoryUrl) {
const normalizedRepositoryUrl = String(repositoryUrl || "")
.trim()
.replace(/\/+$/u, "");
if (!normalizedRepositoryUrl || normalizedRepositoryUrl === HF_BASE_URL) {
return "model";
}
if (normalizedRepositoryUrl === HUGGING_FACE_DATASET_REPOSITORY_URL) {
return "dataset";
}
if (normalizedRepositoryUrl === HUGGING_FACE_SPACE_REPOSITORY_URL) {
return "space";
}
return "model";
}
/**
* Normalize a Hugging Face repository identifier to the canonical namespace/name form.
*
* @param {string} repoId Hugging Face repository id candidate
* @returns {string|undefined} normalized repository id
*/
export function sanitizeHuggingFaceRepoId(repoId) {
const trimmed = String(repoId || "").trim();
let start = 0;
let end = trimmed.length;
while (start < end && trimmed[start] === "/") {
start += 1;
}
while (end > start && trimmed[end - 1] === "/") {
end -= 1;
}
const normalized = trimmed.slice(start, end);
if (!/^[^\s/]+\/[^\s/]+$/u.test(normalized)) {
return undefined;
}
const segments = normalized.split("/");
if (
segments.some((segment) => {
try {
const decoded = decodeURIComponent(segment);
return (
!decoded ||
decoded === "." ||
decoded === ".." ||
decoded.includes("/") ||
decoded.includes("\\")
);
} catch {
return true;
}
})
) {
return undefined;
}
return normalized;
}
const encodePathSegment = (segment) => {
if (segment === ".") {
return "%2E";
}
if (segment === "..") {
return "%2E%2E";
}
return encodeURIComponent(segment);
};
/**
* Encode Hugging Face path segments while preserving path separators.
*
* @param {string} value path-like repository identifier
* @returns {string} encoded path segments
*/
export function encodeHuggingFacePathSegments(value) {
return String(value || "")
.split("/")
.filter(Boolean)
.map((segment) => encodePathSegment(segment))
.join("/");
}
/**
* Convert a Hugging Face asset reference to a canonical web path.
*
* @param {string} assetType asset type such as model, dataset, or space
* @param {string} repoId Hugging Face repository id
* @returns {string|undefined} canonical path under huggingface.co
*/
export function toHuggingFaceAssetPath(assetType, repoId) {
const normalizedRepoId = sanitizeHuggingFaceRepoId(repoId);
if (!normalizedRepoId) {
return undefined;
}
const encodedRepoId = encodeHuggingFacePathSegments(normalizedRepoId);
switch (assetType) {
case "dataset":
return `datasets/${encodedRepoId}`;
case "space":
return `spaces/${encodedRepoId}`;
default:
return encodedRepoId;
}
}
/**
* Convert a Hugging Face asset reference to a canonical web URL.
*
* @param {string} assetType asset type such as model, dataset, or space
* @param {string} repoId Hugging Face repository id
* @returns {string|undefined} canonical URL under huggingface.co
*/
export function toHuggingFaceAssetUrl(assetType, repoId) {
const assetPath = toHuggingFaceAssetPath(assetType, repoId);
return assetPath ? `${HF_BASE_URL}/${assetPath}` : undefined;
}
/**
* Convert a Hugging Face repo reference to a package URL.
*
* @param {string} repoId Hugging Face repository id
* @param {string} [version] optional revision or sha
* @param {string} [repositoryUrl] optional registry URL override
* @returns {string|undefined} normalized Hugging Face purl
*/
export function toHuggingFacePurl(repoId, version, repositoryUrl) {
const normalizedRepoId = sanitizeHuggingFaceRepoId(repoId);
if (!normalizedRepoId) {
return undefined;
}
const [namespace, name] = normalizedRepoId.split("/");
let normalizedVersion;
if (version) {
const trimmedVersion = String(version).trim();
try {
normalizedVersion = decodeURIComponent(trimmedVersion).toLowerCase();
} catch {
normalizedVersion = trimmedVersion.toLowerCase();
}
}
const sanitizedRepositoryUrl = sanitizeBomUrl(repositoryUrl);
const normalizedRepositoryUrl = sanitizedRepositoryUrl?.replace(/\/+$/u, "");
const qualifiers =
normalizedRepositoryUrl && normalizedRepositoryUrl !== HF_BASE_URL
? { repository_url: normalizedRepositoryUrl }
: undefined;
let purlString = new PackageURL(
"huggingface",
namespace,
name,
normalizedVersion,
qualifiers,
).toString();
if (qualifiers?.repository_url) {
purlString = purlString.replace(
/([?&]repository_url=)[^&]+/u,
`$1${encodeURIComponent(qualifiers.repository_url)}`,
);
}
return purlString;
}
/**
* Normalize a direct Hugging Face URL or purl into a repo reference.
*
* @param {string} value direct URL, API URL, or purl
* @returns {{ assetType: string, repoId: string, version?: string }|undefined} normalized reference
*/
export function normalizeHuggingFaceReference(value) {
if (!value) {
return undefined;
}
const normalizedValue = String(value || "").trim();
if (!normalizedValue) {
return undefined;
}
if (normalizedValue.startsWith("pkg:huggingface/")) {
try {
const purl = PackageURL.fromString(normalizedValue);
const repoId = sanitizeHuggingFaceRepoId(
`${purl.namespace}/${purl.name}`,
);
if (repoId) {
return {
assetType: assetTypeFromHuggingFaceRepositoryUrl(
purl.qualifiers?.repository_url,
),
repoId,
...(purl.version ? { version: purl.version } : {}),
};
}
} catch {
// Fall through to the remaining parsers.
}
}
const directTypedMatch = normalizedValue.match(
/^(models|datasets|spaces)\/([^/\s]+\/[^/\s]+)(?:\/revision\/([^/\s?#]+))?$/u,
);
if (directTypedMatch) {
return {
assetType:
directTypedMatch[1] === "datasets"
? "dataset"
: directTypedMatch[1] === "spaces"
? "space"
: "model",
repoId: sanitizeHuggingFaceRepoId(directTypedMatch[2]),
...(directTypedMatch[3]
? { version: decodeURIComponent(directTypedMatch[3]) }
: {}),
};
}
const looksLikeFilesystemPath =
/^(?:\/|\.{1,2}(?:\/|$)|~\/|[a-z]:[\\/])/iu.test(normalizedValue) ||
normalizedValue.includes("\\");
const directRepoId = looksLikeFilesystemPath
? undefined
: sanitizeHuggingFaceRepoId(normalizedValue);
if (directRepoId) {
return { assetType: "model", repoId: directRepoId };
}
try {
const parsed = new URL(normalizedValue);
if (parsed.hostname !== "huggingface.co") {
return undefined;
}
const pathSegments = parsed.pathname.split("/").filter(Boolean);
let assetType = "model";
if (pathSegments[0] === "api") {
if (pathSegments[1] === "datasets") {
assetType = "dataset";
} else if (pathSegments[1] === "spaces") {
assetType = "space";
}
pathSegments.splice(0, 2);
} else if (pathSegments[0] === "datasets") {
assetType = "dataset";
pathSegments.shift();
} else if (pathSegments[0] === "spaces") {
assetType = "space";
pathSegments.shift();
} else if (pathSegments[0] === "models") {
pathSegments.shift();
}
if (pathSegments.length < 2) {
return undefined;
}
const repoId = sanitizeHuggingFaceRepoId(
`${pathSegments[0]}/${pathSegments[1]}`,
);
if (!repoId) {
return undefined;
}
let version;
if (pathSegments[2] === "revision" && pathSegments[3]) {
version = decodeURIComponent(pathSegments[3]);
}
return {
assetType,
repoId,
...(version ? { version } : {}),
};
} catch {
return undefined;
}
}
/**
* Normalize a Hugging Face dataset descriptor into reusable fields.
*
* @param {object|string} dataset dataset reference from model-card metadata
* @param {{ urlSanitizer?: (url: string|undefined) => string|undefined }} [options={}] dataset normalization options
* @returns {{
* assetType: "dataset",
* bomRef: string,
* description?: string,
* group: string,
* name: string,
* repoId: string,
* url: string,
* }|undefined} normalized dataset metadata
*/
export function normalizeHuggingFaceDataset(dataset, options = {}) {
if (!dataset) {
return undefined;
}
const urlSanitizer =
typeof options.urlSanitizer === "function"
? options.urlSanitizer
: sanitizeBomUrl;
let normalized;
let description;
let rawUrl;
if (typeof dataset === "string") {
normalized = normalizeHuggingFaceReference(
dataset.startsWith("datasets/") ? dataset : `datasets/${dataset}`,
);
} else {
const datasetName =
dataset.name || dataset.id || dataset.type || dataset.path || undefined;
normalized = datasetName
? normalizeHuggingFaceReference(
String(datasetName).includes("/")
? `datasets/${String(datasetName).replace(/^datasets\//u, "")}`
: datasetName,
)
: undefined;
description = [dataset.config, dataset.split].filter(Boolean).join(" / ");
rawUrl = dataset.url;
}
if (!normalized?.repoId) {
return undefined;
}
const [group, name] = normalized.repoId.split("/");
const bomRef = toHuggingFacePurl(
normalized.repoId,
normalized.version,
repositoryUrlForHuggingFaceAssetType("dataset"),
);
const sanitizedUrl = urlSanitizer(rawUrl);
return {
assetType: "dataset",
bomRef,
description: sanitizeBomPropertyValue(
"cdx:huggingface:datasetDescription",
description,
),
group,
name,
purl: bomRef,
repoId: normalized.repoId,
url: sanitizedUrl || toHuggingFaceAssetUrl("dataset", normalized.repoId),
};
}
/**
* Create an inline CycloneDX dataset object from Hugging Face model-card metadata.
*
* @param {object|string} dataset dataset reference from model-card metadata
* @param {{ urlSanitizer?: (url: string|undefined) => string|undefined }} [options={}] dataset normalization options
* @returns {{ contents?: { url: string }, description?: string, name: string, type: string }|undefined} inline dataset object
*/
export function createInlineHuggingFaceDataset(dataset, options = {}) {
if (!dataset) {
return undefined;
}
if (typeof dataset === "string") {
const normalized = normalizeHuggingFaceDataset(dataset, options);
return normalized
? {
type: "dataset",
name: normalized.repoId,
contents: normalized.url ? { url: normalized.url } : undefined,
}
: { type: "dataset", name: dataset };
}
const normalized = normalizeHuggingFaceDataset(dataset, options);
const datasetName =
dataset.name || dataset.id || dataset.type || dataset.path || undefined;
return {
type: "dataset",
name: normalized?.repoId || datasetName,
contents: normalized?.url ? { url: normalized.url } : undefined,
description: normalized?.description,
};
}
/**
* Convert Hugging Face model-index entries into CycloneDX performance metrics.
*
* @param {Array<object>} [modelIndex=[]] model-index entries from model-card metadata
* @returns {Array<{ slice?: string, type: string, value: string }>} CycloneDX performance metrics
*/
export function createPerformanceMetrics(modelIndex = []) {
const metrics = [];
for (const entry of normalizeArray(modelIndex)) {
for (const result of normalizeArray(entry?.results)) {
for (const metric of normalizeArray(result?.metrics)) {
if (!metric?.type && !metric?.name) {
continue;
}
metrics.push({
type: metric.type || metric.name,
value:
metric.value === undefined || metric.value === null
? ""
: String(metric.value),
slice:
[result?.dataset?.name, result?.dataset?.split]
.filter(Boolean)
.join(" / ") || undefined,
});
}
}
}
return metrics.filter((metric) => metric.value);
}
/**
* Derive a human-readable quantization label from a Hugging Face quantization config.
*
* @param {object|string} quantizationConfig Hugging Face quantization configuration
* @returns {string|undefined} normalized quantization label
*/
export function quantizationValueFromConfig(quantizationConfig) {
if (!quantizationConfig) {
return undefined;
}
if (typeof quantizationConfig === "string") {
return quantizationConfig;
}
const bits =
quantizationConfig.bits ||
(quantizationConfig.load_in_4bit ? 4 : undefined) ||
(quantizationConfig.load_in_8bit ? 8 : undefined);
const values = [
quantizationConfig.quant_method,
quantizationConfig.quantization_method,
quantizationConfig.quant_type,
bits ? `${bits}-bit` : undefined,
].filter(Boolean);
return values.length ? values.join(" ") : undefined;
}