playcanvas

/** * A compute-based GPU radix sort implementation using 4-bit radix (16 buckets). * Provides stable sorting of 32-bit unsigned integer keys, returning sorted indices. * WebGPU only. * * **Performance characteristics:** * - 4 passes for 16-bit keys, 8 passes for 32-bit keys * - Each pass processes 4 bits (16 buckets) * - Workgroup size: 16x16 = 256 threads, 8 elements per thread = 2048 elements/workgroup * * **Algorithm (per pass):** * 1. **Histogram**: Each thread extracts 4-bit digits from its elements and * contributes to a per-workgroup histogram using shared memory atomics. * 2. **Prefix Sum**: Hierarchical Blelloch scan on block histograms to compute * global offsets for each (digit, workgroup) pair. * 3. **Ranked Scatter**: Re-reads keys in rounds, computes local ranks using * per-digit 256-bit bitmasks and hardware popcount, then scatters using: * `position = global_prefix[digit][workgroup] + cumulative_local_rank` * * Based on "Fast 4-way parallel radix sorting on GPUs" algorithm, implemented * following [WebGPU-Radix-Sort](https://github.com/kishimisu/WebGPU-Radix-Sort) * by kishimisu (MIT License). * * @example * // Create the radix sort instance (reusable) * const radixSort = new ComputeRadixSort(device); * * // Create a storage buffer with keys to sort * const keys = new Uint32Array([5, 2, 8, 1, 9, 3]); * const keysBuffer = new StorageBuffer(device, keys.byteLength, BUFFERUSAGE_COPY_DST); * keysBuffer.write(keys); * * // Sort and get indices buffer (keys with values [5,2,8,1,9,3] → indices [3,1,5,0,2,4]) * const sortedIndices = radixSort.sort(keysBuffer, keys.length, 16); // 16-bit sort * * // Use sortedIndices buffer in subsequent GPU operations * // Clean up when done * radixSort.destroy(); * * @category Graphics * @ignore */ export class ComputeRadixSort { /** * Creates a new ComputeRadixSort instance. * * @param {GraphicsDevice} device - The graphics device (must support compute). */ constructor(device: GraphicsDevice); /** * The graphics device. * * @type {GraphicsDevice} */ device: GraphicsDevice; /** * Current element count. * * @type {number} */ _elementCount: number; /** * Number of workgroups for current sort. * * @type {number} */ _workgroupCount: number; /** * Allocated workgroup capacity. Tracks the last allocated size; reallocation is triggered * when the effective workgroup count (derived from element count and capacity) differs. * * @type {number} */ _allocatedWorkgroupCount: number; /** * Minimum element capacity for internal buffers. When set, `_allocateBuffers` uses * `max(elementCount, capacity)` as the effective size. The caller can lower this value * to request shrinkage; actual reallocation is deferred to the next sort call. * After allocation, this is updated to reflect the effective element count. * * @type {number} */ capacity: number; /** * Current number of bits for which passes are created. * * @type {number} */ _numBits: number; /** * Internal keys buffer 0 (ping-pong). * * @type {StorageBuffer|null} */ _keys0: StorageBuffer | null; /** * Internal keys buffer 1 (ping-pong). * * @type {StorageBuffer|null} */ _keys1: StorageBuffer | null; /** * Internal values/indices buffer 0 (ping-pong). * * @type {StorageBuffer|null} */ _values0: StorageBuffer | null; /** * Internal values/indices buffer 1 (ping-pong). * * @type {StorageBuffer|null} */ _values1: StorageBuffer | null; /** * Block sums buffer (16 per workgroup). * * @type {StorageBuffer|null} */ _blockSums: StorageBuffer | null; /** * Output sorted indices buffer. * * @type {StorageBuffer|null} */ _sortedIndices: StorageBuffer | null; /** * Prefix sum kernel for block sums. * * @type {PrefixSumKernel|null} */ _prefixSumKernel: PrefixSumKernel | null; /** * Dispatch dimensions. * * @type {Vec2} */ _dispatchSize: Vec2; /** * Cached bind group format for histogram shader (created lazily for current mode). * * @type {BindGroupFormat|null} */ _histogramBindGroupFormat: BindGroupFormat | null; /** * Cached bind group format for reorder shader (created lazily for current mode). * * @type {BindGroupFormat|null} */ _reorderBindGroupFormat: BindGroupFormat | null; /** * Uniform buffer format for runtime uniforms. * * @type {UniformBufferFormat|null} */ _uniformBufferFormat: UniformBufferFormat | null; /** * Cached compute passes. Each entry contains {histogramCompute, reorderCompute} for one pass. * * @type {Array<{histogramCompute: Compute, reorderCompute: Compute}>} */ _passes: Array<{ histogramCompute: Compute; reorderCompute: Compute; }>; /** * Whether the current passes are for indirect sort mode. * * @type {boolean} */ _indirect: boolean; /** * Whether the current passes expect caller-supplied initial values on pass 0. * * @type {boolean} */ _hasInitialValues: boolean; /** * Whether the last pass skips writing sorted keys (only values are written). * When true, `sortedKeys` will contain stale data after sorting. * * @type {boolean} */ _skipLastPassKeyWrite: boolean; /** * Destroys the ComputeRadixSort instance and releases all resources. */ destroy(): void; /** * Destroys all cached passes and their shaders. * * @private */ private _destroyPasses; /** * Destroys internal buffers (not passes or bind group formats). * * @private */ private _destroyBuffers; /** * Gets the sorted indices (or values) buffer. * * @type {StorageBuffer|null} */ get sortedIndices(): StorageBuffer | null; /** * Gets the sorted keys buffer after the last sort operation. The keys end up * in one of the internal ping-pong buffers depending on the number of passes. * * @type {StorageBuffer|null} */ get sortedKeys(): StorageBuffer | null; /** * Ensures bind group formats exist for the given mode. Destroys and recreates * them if switching between direct and indirect modes. * * @param {boolean} indirect - Whether to create indirect sort formats. * @private */ private _ensureBindGroupFormats; /** * Creates cached compute passes for all bit offsets. * * @param {number} numBits - Number of bits to sort. * @param {boolean} indirect - Whether to create indirect sort passes. * @param {boolean} hasInitialValues - Whether pass 0 reads from caller-supplied initial values. * @param {boolean} skipLastPassKeyWrite - Whether the last pass skips writing keys. * @private */ private _createPasses; /** * Allocates or resizes internal buffers and creates passes if needed. * * @param {number} elementCount - Number of elements to sort. * @param {number} numBits - Number of bits to sort. * @param {boolean} indirect - Whether passes should use indirect dispatch. * @param {boolean} hasInitialValues - Whether pass 0 reads caller-supplied initial values. * @param {boolean} skipLastPassKeyWrite - Whether the last pass skips writing keys. * @private */ private _allocateBuffers; /** * Creates a shader with constants embedded. * * @param {string} name - Shader name. * @param {string} source - Shader source. * @param {number} currentBit - Current bit offset for this pass. * @param {boolean} isFirstPass - Whether this is the first pass (uses GID for indices). * @param {BindGroupFormat} bindGroupFormat - Bind group format. * @param {boolean} indirect - Whether to add the USE_INDIRECT_SORT define. * @returns {Shader} The created shader. * @private */ private _createShader; /** * Executes the GPU radix sort using direct dispatch. * * @param {StorageBuffer} keysBuffer - Input storage buffer containing u32 keys. * @param {number} elementCount - Number of elements to sort. * @param {number} [numBits] - Number of bits to sort (must be multiple of 4). Defaults to 16. * @param {StorageBuffer} [initialValues] - Optional buffer of initial values for pass 0. * When provided, the sort produces output values derived from this buffer instead of * sequential indices. The buffer is only read, never modified. * @param {boolean} [skipLastPassKeyWrite] - When true, the last pass skips writing sorted * keys for a small performance gain. Only use when sorted keys are not needed after sorting. * @returns {StorageBuffer} Storage buffer containing sorted indices (or values if * initialValues was provided). */ sort(keysBuffer: StorageBuffer, elementCount: number, numBits?: number, initialValues?: StorageBuffer, skipLastPassKeyWrite?: boolean): StorageBuffer; /** * Executes the GPU radix sort using indirect dispatch. Only sorts `visibleCount` * elements (GPU-written) instead of the full buffer, reducing sort cost proportionally. * * @param {StorageBuffer} keysBuffer - Input storage buffer containing u32 keys. * @param {number} maxElementCount - Maximum number of elements (buffer allocation size). * @param {number} numBits - Number of bits to sort (must be multiple of 4). * @param {number} dispatchSlot - Slot index in the device's indirect dispatch buffer. * @param {StorageBuffer} sortElementCountBuffer - GPU-written buffer containing visible count. * @param {StorageBuffer} [initialValues] - Optional buffer of initial values for pass 0. * When provided, the sort produces output values derived from this buffer instead of * sequential indices. The buffer is only read, never modified. * @param {boolean} [skipLastPassKeyWrite] - When true, the last pass skips writing sorted * keys for a small performance gain. Only use when sorted keys are not needed after sorting. * @returns {StorageBuffer} Storage buffer containing sorted values. */ sortIndirect(keysBuffer: StorageBuffer, maxElementCount: number, numBits: number, dispatchSlot: number, sortElementCountBuffer: StorageBuffer, initialValues?: StorageBuffer, skipLastPassKeyWrite?: boolean): StorageBuffer; /** * Shared execution logic for both direct and indirect radix sort. * * @param {StorageBuffer} keysBuffer - Input keys buffer. * @param {number} elementCount - Number of elements (or max elements for indirect). * @param {number} numBits - Number of bits to sort. * @param {boolean} indirect - Whether to use indirect dispatch. * @param {number} dispatchSlot - Indirect dispatch slot index (-1 for direct). * @param {StorageBuffer|null} sortElementCountBuffer - GPU-written element count (null for direct). * @param {StorageBuffer} [initialValues] - Optional initial values buffer for pass 0. * @param {boolean} [skipLastPassKeyWrite] - When true, the last pass skips writing sorted * keys for a small performance gain. Only use when sorted keys are not needed after sorting. * @returns {StorageBuffer} Storage buffer containing sorted values. * @private */ private _execute; } declare const ELEMENTS_PER_WORKGROUP: number; import type { GraphicsDevice } from '../../platform/graphics/graphics-device.js'; import { StorageBuffer } from '../../platform/graphics/storage-buffer.js'; import { PrefixSumKernel } from './prefix-sum-kernel.js'; import { Vec2 } from '../../core/math/vec2.js'; import { BindGroupFormat } from '../../platform/graphics/bind-group-format.js'; import { UniformBufferFormat } from '../../platform/graphics/uniform-buffer-format.js'; import { Compute } from '../../platform/graphics/compute.js'; export { ELEMENTS_PER_WORKGROUP as RADIX_SORT_ELEMENTS_PER_WORKGROUP };