UNPKG

@huggingface/hub

Version:

Utilities to interact with the Hugging Face hub

599 lines (559 loc) 17.8 kB
import type { BucketDesignation, CredentialsParams, RepoDesignation, RepoId } from "../types/public";
import { checkCredentials } from "../utils/checkCredentials";
import { formatBytes } from "../utils/formatBytes";
import { promisesQueue } from "../utils/promisesQueue";
import { toRepoId } from "../utils/toRepoId";
import { eventToGenerator } from "../utils/eventToGenerator";
import type { CommitOperation, CommitParams } from "./commit";
import { commit } from "./commit";
import { downloadFile } from "./download-file";
import type { ListFileEntry } from "./list-files";
import { listFiles } from "./list-files";
import type { PathInfo } from "./paths-info";
import { pathsInfo } from "./paths-info";

/**
 * Progress notification produced by {@link copyFileIter}, {@link copyFilesIter} and
 * {@link copyFolderIter}.
 *
 * `fileDownloaded` is the only event kind for now. It is emitted once for every source
 * file that has to be downloaded (small git-stored files that cannot be copied
 * server-side). Xet-backed files are copied server-side and never produce an event.
 */
export interface CopyProgressEvent {
	event: "fileDownloaded";
	/** Source path of the file whose download just finished. */
	path: string;
	/** How many files have been downloaded so far, this one included. */
	downloaded: number;
	/** How many files will be downloaded in total. */
	total: number;
}

/** Maximum number of source files downloaded at the same time. */
const DOWNLOAD_CONCURRENCY = 5;
/** Maximum number of paths sent in a single `pathsInfo` request. */
const PATHS_INFO_BATCH_SIZE = 100;
/** Maximum number of offending paths spelled out in the "unmigrated LFS" error message. */
const MAX_REPORTED_LFS_PATHS = 5;

/**
 * Where a file comes from in {@link copyFile} / {@link copyFiles} / {@link copyFolder}.
 */
export interface CopySource {
	repo: RepoDesignation;
	/**
	 * File path inside the source repo (a folder path for {@link copyFolder}).
	 * With {@link copyFolder}, leave it empty to copy the whole repo.
	 */
	path: string;
	/**
	 * Git revision the source is read from. Not used when the source is a bucket.
	 *
	 * @default "main"
	 */
	revision?: string;
}

/**
 * Where a file goes in {@link copyFile} / {@link copyFolder}.
 *
 * The destination has to be a bucket: server-side copy is currently only supported
 * towards buckets.
 */
export interface CopyDestination {
	repo: BucketDesignation;
	/**
	 * Exact path inside the destination bucket. {@link copyFolder} treats it as a
	 * prefix; leave it empty to copy directly under the bucket root.
	 */
	path: string;
}

/**
 * A single file of a {@link copyFiles} call.
 */
export interface CopyFilesEntry {
	source: CopySource;
	/**
	 * Exact path inside the destination bucket. The bucket is common to all entries and
	 * is given by the top-level `destination` parameter of {@link copyFiles}.
	 */
	destinationPath: string;
}

/** Network and credential options accepted by every copy function in this module. */
type SharedParams = {
	hubUrl?: CommitParams["hubUrl"];
	fetch?: CommitParams["fetch"];
	abortSignal?: CommitParams["abortSignal"];
} & Partial<CredentialsParams>;

/**
 * Copy one file from a source repo/bucket into the destination bucket.
 *
 * Xet-backed source files are copied server-side, without any data transfer. Small
 * non-xet repo files (e.g. `config.json`) are downloaded and re-uploaded to the
 * destination bucket within the same commit.
 *
 * LFS pointer files that were not migrated to xet are rejected up front, because
 * copying them would require downloading the full LFS blob.
 *
 * This is a thin wrapper around {@link copyFiles} with a single entry.
 *
 * @example
 * ```ts
 * await copyFile({
 *   source: {
 *     repo: { type: "model", name: "username/my-model" },
 *     path: "model.safetensors",
 *   },
 *   destination: {
 *     repo: { type: "bucket", name: "username/my-bucket" },
 *     path: "models/my-model/model.safetensors",
 *   },
 *   accessToken: "hf_...",
 * });
 * ```
 */
export function copyFile(
	params: {
		source: CopySource;
		destination: CopyDestination;
	} & SharedParams,
): Promise<undefined> {
	const { source, destination } = params;
	const singleEntry: CopyFilesEntry = { source, destinationPath: destination.path };

	return copyFiles({
		...(params.accessToken ? { accessToken: params.accessToken } : { credentials: params.credentials }),
		destination: destination.repo,
		files: [singleEntry],
		hubUrl: params.hubUrl,
		fetch: params.fetch,
		abortSignal: params.abortSignal,
	});
}

/**
 * Async-iterator flavour of {@link copyFile}: yields a {@link CopyProgressEvent} for each
 * non-xet source file that gets downloaded (xet-backed files are copied server-side and
 * emit nothing). Semantics are those of {@link copyFile}.
 *
 * This is a thin wrapper around {@link copyFilesIter} with a single entry.
 *
 * @example
 * ```ts
 * for await (const event of copyFileIter({ source, destination, accessToken })) {
 *   console.log(`downloaded ${event.path} (${event.downloaded}/${event.total})`);
 * }
 * ```
 */
export function copyFileIter(
	params: {
		source: CopySource;
		destination: CopyDestination;
	} & SharedParams,
): AsyncGenerator<CopyProgressEvent, undefined> {
	const { source, destination } = params;
	const singleEntry: CopyFilesEntry = { source, destinationPath: destination.path };

	return copyFilesIter({
		...(params.accessToken ? { accessToken: params.accessToken } : { credentials: params.credentials }),
		destination: destination.repo,
		files: [singleEntry],
		hubUrl: params.hubUrl,
		fetch: params.fetch,
		abortSignal: params.abortSignal,
	});
}

/**
 * Copy several files, possibly coming from different source repos/buckets, into the
 * destination bucket with a single commit.
 *
 * Xet-backed source files are copied server-side, without any data transfer. Non-xet
 * source files (typically small git-stored repo files) are downloaded and re-uploaded
 * as part of the same commit.
 *
 * LFS pointer files that were not migrated to xet are rejected up front.
 *
 * Internally this drains {@link copyFilesIter} and discards its progress events.
 *
 * @example
 * ```ts
 * await copyFiles({
 *   destination: { type: "bucket", name: "username/my-bucket" },
 *   files: [
 *     {
 *       source: {
 *         repo: { type: "bucket", name: "username/other-bucket" },
 *         path: "data.bin",
 *       },
 *       destinationPath: "data.bin",
 *     },
 *     {
 *       source: {
 *         repo: { type: "model", name: "username/my-model" },
 *         path: "model.safetensors",
 *       },
 *       destinationPath: "models/my-model/model.safetensors",
 *     },
 *   ],
 *   accessToken: "hf_...",
 * });
 * ```
 */
export async function copyFiles(
	params: {
		destination: BucketDesignation;
		files: CopyFilesEntry[];
	} & SharedParams,
): Promise<undefined> {
	const events = copyFilesIter(params);

	// Pull events until the generator completes; the events themselves are not needed here.
	let step = await events.next();
	while (!step.done) {
		step = await events.next();
	}
	return undefined;
}

/**
 * Async-iterator flavour of {@link copyFiles}: yields a {@link CopyProgressEvent} for each
 * non-xet source file that gets downloaded (xet-backed files are copied server-side and
 * emit nothing). Semantics are those of {@link copyFiles}.
 *
 * Returns immediately, without committing, when `files` is empty.
 */
export async function* copyFilesIter(
	params: {
		destination: BucketDesignation;
		files: CopyFilesEntry[];
	} & SharedParams,
): AsyncGenerator<CopyProgressEvent, undefined> {
	const { files, destination } = params;
	if (files.length === 0) {
		return undefined;
	}

	// Resolving the operations forwards the download progress events to our consumer.
	const operations = yield* resolveCopyOperationsIter(params, files);

	await commit({
		...(params.accessToken ? { accessToken: params.accessToken } : { credentials: params.credentials }),
		repo: destination,
		operations,
		title: "",
		hubUrl: params.hubUrl,
		fetch: params.fetch,
		abortSignal: params.abortSignal,
	});
	return undefined;
}

/**
 * Recursively copy a folder from a source repo/bucket into the destination bucket with
 * a single commit.
 *
 * Each file path is resolved relative to {@link CopySource.path}, so the source folder
 * itself is not reproduced in the destination unless {@link CopyDestination.path}
 * keeps it.
 *
 * Internally this drains {@link copyFolderIter} and discards its progress events.
 *
 * @example
 * ```ts
 * // Copy an entire dataset under "datasets/my-dataset/" in the bucket
 * await copyFolder({
 *   source: { repo: { type: "dataset", name: "username/my-dataset" } },
 *   destination: {
 *     repo: { type: "bucket", name: "username/my-bucket" },
 *     path: "datasets/my-dataset/",
 *   },
 *   accessToken: "hf_...",
 * });
 *
 * // Copy a subfolder
 * await copyFolder({
 *   source: {
 *     repo: { type: "bucket", name: "username/src-bucket" },
 *     path: "models/",
 *   },
 *   destination: {
 *     repo: { type: "bucket", name: "username/dst-bucket" },
 *     path: "backup/",
 *   },
 *   accessToken: "hf_...",
 * });
 * ```
 */
export async function copyFolder(
	params: {
		source: Omit<CopySource, "path"> & { path?: string };
		destination: Omit<CopyDestination, "path"> & { path?: string };
	} & SharedParams,
): Promise<undefined> {
	const events = copyFolderIter(params);

	// Pull events until the generator completes; the events themselves are not needed here.
	let step = await events.next();
	while (!step.done) {
		step = await events.next();
	}
	return undefined;
}

/**
 * Async-iterator flavour of {@link copyFolder}: yields a {@link CopyProgressEvent} for each
 * non-xet source file that gets downloaded (xet-backed files are copied server-side and
 * emit nothing). Semantics are those of {@link copyFolder}.
 *
 * Throws when the listing contains LFS files that were not migrated to xet. Returns
 * without committing when the listing produces no operation at all.
 */
export async function* copyFolderIter(
	params: {
		source: Omit<CopySource, "path"> & { path?: string };
		destination: Omit<CopyDestination, "path"> & { path?: string };
	} & SharedParams,
): AsyncGenerator<CopyProgressEvent, undefined> {
	const accessToken = checkCredentials(params);
	const sourceRepoId = toRepoId(params.source.repo);

	// Trailing slashes are dropped so that "models/" and "models" behave the same.
	const trailingSlashes = /\/+$/;
	const sourcePath = (params.source.path ?? "").replace(trailingSlashes, "");
	const destinationPrefix = (params.destination.path ?? "").replace(trailingSlashes, "");

	// Buckets are listed without a revision; other repos default to "main".
	const sourceRevision = sourceRepoId.type === "bucket" ? undefined : (params.source.revision ?? "main");

	const operations: CommitOperation[] = [];
	const pendingDownloads: PendingDownload[] = [];
	const lfsOffenders: Array<{ path: string; size: number }> = [];

	const listing = listFiles({
		repo: sourceRepoId,
		path: sourcePath || undefined,
		recursive: true,
		revision: sourceRevision,
		accessToken,
		hubUrl: params.hubUrl,
		fetch: params.fetch,
	});

	for await (const entry of listing) {
		if (entry.type !== "file") {
			continue;
		}

		const relative = relativeUnderFolder(entry.path, sourcePath);
		const target = destinationPrefix ? `${destinationPrefix}/${relative}` : relative;
		const kind = classifySourceFile(entry);

		if (kind === "copy") {
			operations.push({
				operation: "copy",
				path: target,
				sourceXetHash: entry.xetHash as string,
				sourceRepo: sourceRepoId,
			});
		} else if (kind === "lfs") {
			// Collected rather than thrown right away so the error can list several offenders.
			lfsOffenders.push({ path: entry.path, size: entry.lfs?.size ?? entry.size });
		} else {
			// Regular git-stored file (small): download + re-upload in the same commit.
			// An empty placeholder blob is queued now and replaced once the download is done.
			const slot =
				operations.push({
					operation: "addOrUpdate",
					path: target,
					content: new Blob([]),
				}) - 1;
			pendingDownloads.push({
				index: slot,
				repoId: sourceRepoId,
				revision: sourceRevision,
				sourcePath: entry.path,
			});
		}
	}

	if (lfsOffenders.length > 0) {
		throwUnmigratedLfsError(sourceRepoId, lfsOffenders);
	}
	if (operations.length === 0) {
		return undefined;
	}

	yield* downloadAndFillBlobsIter({
		pendingDownloads,
		operations,
		accessToken,
		hubUrl: params.hubUrl,
		fetch: params.fetch,
	});

	await commit({
		...(params.accessToken ? { accessToken: params.accessToken } : { credentials: params.credentials }),
		repo: params.destination.repo,
		operations,
		title: "",
		hubUrl: params.hubUrl,
		fetch: params.fetch,
		abortSignal: params.abortSignal,
	});
	return undefined;
}

/**
 * Turn a list of {@link CopyFilesEntry} into `CommitOperation`s.
 *
 * Entries are grouped by source repo and revision so that `pathsInfo` can be called in
 * batches, and the non-xet files are then downloaded in parallel. One
 * {@link CopyProgressEvent} is yielded per downloaded file.
 *
 * The returned array is index-aligned with `files`.
 *
 * Throws when a source path is missing, is a folder, or is an LFS file that was not
 * migrated to xet.
 */
async function* resolveCopyOperationsIter(
	shared: SharedParams,
	files: CopyFilesEntry[],
): AsyncGenerator<CopyProgressEvent, CommitOperation[]> {
	type SourceGroup = {
		repoId: RepoId;
		revision: string | undefined;
		entries: Array<{ index: number; file: CopyFilesEntry }>;
	};
	type FetchedInfo = Awaited<ReturnType<typeof pathsInfo>>[number];

	const accessToken = checkCredentials(shared);

	// Group files by (source repo, source revision) so we can batch pathsInfo calls.
	const groups = new Map<string, SourceGroup>();
	for (const [index, file] of files.entries()) {
		const repoId = toRepoId(file.source.repo);
		const revision = repoId.type === "bucket" ? undefined : (file.source.revision ?? "main");
		const key = `${repoId.type}\0${repoId.name}\0${revision ?? ""}`;

		const existing = groups.get(key);
		if (existing) {
			existing.entries.push({ index, file });
		} else {
			groups.set(key, { repoId, revision, entries: [{ index, file }] });
		}
	}

	const operations: CommitOperation[] = new Array(files.length);
	const pendingDownloads: PendingDownload[] = [];

	for (const { repoId, revision, entries } of groups.values()) {
		// Fetch the metadata of every path of the group, one batch after the other.
		const infoByPath = new Map<string, FetchedInfo>();
		for (let start = 0; start < entries.length; start += PATHS_INFO_BATCH_SIZE) {
			const batch = entries.slice(start, start + PATHS_INFO_BATCH_SIZE).map((entry) => entry.file.source.path);
			const fetched = await pathsInfo({
				repo: repoId,
				paths: batch,
				revision,
				accessToken,
				hubUrl: shared.hubUrl,
				fetch: shared.fetch,
			});
			for (const info of fetched) {
				infoByPath.set(info.path, info);
			}
		}

		const lfsOffenders: Array<{ path: string; size: number }> = [];
		for (const { index, file } of entries) {
			const sourcePath = file.source.path;
			const info = infoByPath.get(sourcePath);

			if (!info) {
				throw new Error(`Source file not found: '${sourcePath}' in ${repoId.type}s/${repoId.name}`);
			}
			if (info.type !== "file") {
				throw new Error(
					`Source path '${sourcePath}' in ${repoId.type}s/${repoId.name} is a folder; use copyFolder() instead.`,
				);
			}

			const kind = classifySourceFile(info);
			if (kind === "copy") {
				operations[index] = {
					operation: "copy",
					path: file.destinationPath,
					sourceXetHash: info.xetHash as string,
					sourceRepo: repoId,
				};
			} else if (kind === "lfs") {
				// Collected rather than thrown right away so the error can list several offenders.
				lfsOffenders.push({ path: sourcePath, size: info.lfs?.size ?? info.size });
			} else {
				// Empty placeholder blob, replaced once the download is done.
				operations[index] = {
					operation: "addOrUpdate",
					path: file.destinationPath,
					content: new Blob([]),
				};
				pendingDownloads.push({ index, repoId, revision, sourcePath });
			}
		}

		if (lfsOffenders.length > 0) {
			throwUnmigratedLfsError(repoId, lfsOffenders);
		}
	}

	yield* downloadAndFillBlobsIter({
		pendingDownloads,
		operations,
		accessToken,
		hubUrl: shared.hubUrl,
		fetch: shared.fetch,
	});
	return operations;
}

/** A source file that must be downloaded to fill the placeholder operation at `index`. */
interface PendingDownload {
	index: number;
	repoId: RepoId;
	revision: string | undefined;
	sourcePath: string;
}

/**
 * Download every entry of `pendingDownloads` (at most `DOWNLOAD_CONCURRENCY` at a time)
 * and store each downloaded blob in the matching `addOrUpdate` placeholder of
 * `operations`. One {@link CopyProgressEvent} is yielded per file as it completes.
 * Completes right away when there is nothing to download.
 *
 * The generator rejects when a download yields no blob or when the targeted operation
 * is not an `addOrUpdate` placeholder.
 */
function downloadAndFillBlobsIter(args: {
	pendingDownloads: PendingDownload[];
	operations: CommitOperation[];
	accessToken: string | undefined;
	hubUrl: string | undefined;
	fetch: typeof fetch | undefined;
}): AsyncGenerator<CopyProgressEvent, void> {
	const { pendingDownloads, operations } = args;
	const total = pendingDownloads.length;

	return eventToGenerator<CopyProgressEvent, void>((yieldCallback, returnCallback, rejectCallback) => {
		if (total === 0) {
			returnCallback();
			return;
		}

		// Shared by all tasks: counts completed downloads for the progress events.
		let downloaded = 0;

		const tasks = pendingDownloads.map((pending) => async () => {
			const { index, repoId, revision, sourcePath } = pending;

			const blob = await downloadFile({
				repo: repoId,
				path: sourcePath,
				revision,
				accessToken: args.accessToken,
				hubUrl: args.hubUrl,
				fetch: args.fetch,
			});
			if (!blob) {
				throw new Error(`Failed to download '${sourcePath}' from ${repoId.type}s/${repoId.name}`);
			}

			const op = operations[index];
			if (op.operation !== "addOrUpdate") {
				throw new Error("Internal: expected addOrUpdate placeholder operation");
			}
			op.content = blob;

			downloaded += 1;
			yieldCallback({ event: "fileDownloaded", path: sourcePath, downloaded, total });
		});

		promisesQueue(tasks, DOWNLOAD_CONCURRENCY).then(
			() => returnCallback(),
			(err) => rejectCallback(err),
		);
	});
}

/**
 * Express `filePath` relative to `folderPath`. Used to map a source path located under
 * the folder being copied to its path under the destination prefix.
 *
 * - An empty `folderPath` returns `filePath` unchanged.
 * - When both paths are equal, the last `/`-separated segment of `filePath` is returned.
 * - Otherwise `filePath` must start with `folderPath + "/"`, and what follows is returned.
 *
 * @throws Error when `filePath` is not located under `folderPath`.
 */
export function relativeUnderFolder(filePath: string, folderPath: string): string {
	if (!folderPath) {
		return filePath;
	}

	if (filePath === folderPath) {
		const segments = filePath.split("/");
		return segments[segments.length - 1] ?? filePath;
	}

	const prefix = `${folderPath}/`;
	if (!filePath.startsWith(prefix)) {
		throw new Error(`Path '${filePath}' is not inside folder '${folderPath}'`);
	}
	return filePath.slice(prefix.length);
}

/**
 * Tell how the copy pipeline must treat a source file:
 * - `"copy"`: xet-backed, can be copied server-side.
 * - `"download"`: regular git-stored file, safe to download + re-upload.
 * - `"lfs"`: LFS pointer file that was not migrated to xet. Those are refused because
 *   they can be arbitrarily large; the caller should migrate them to xet first.
 *
 * The xet hash is checked first, so a file carrying both a xet hash and LFS info is
 * classified as `"copy"`.
 */
function classifySourceFile(file: ListFileEntry | PathInfo): "copy" | "download" | "lfs" {
	if (file.xetHash) {
		return "copy";
	}
	return file.lfs ? "lfs" : "download";
}

/**
 * Throw the error reporting LFS files that were not migrated to xet. At most
 * `MAX_REPORTED_LFS_PATHS` paths are spelled out (with their formatted size); the
 * remaining ones are only counted.
 */
function throwUnmigratedLfsError(repoId: RepoId, entries: Array<{ path: string; size: number }>): never {
	const listed = entries
		.slice(0, MAX_REPORTED_LFS_PATHS)
		.map((entry) => `'${entry.path}' (${formatBytes(entry.size)})`)
		.join(", ");

	const hiddenCount = entries.length - MAX_REPORTED_LFS_PATHS;
	const suffix = hiddenCount > 0 ? ` (and ${hiddenCount} more)` : "";

	throw new Error(
		`Cannot copy ${entries.length} LFS file(s) from ${repoId.type}s/${repoId.name} that have not been migrated to xet: ${listed}${suffix}. ` +
			`Migrate these files to xet before copying.`,
	);
}