pdq-wasm
Version:
WebAssembly bindings for Meta's PDQ perceptual image hashing algorithm
672 lines (671 loc) • 24.1 kB
JavaScript
/**
* Browser-specific utilities for PDQ perceptual hashing
* These utilities provide convenient helpers for working with PDQ in browser environments
*/
import { PDQ } from './pdq.js';
/**
* Detect the current runtime environment and recommend the appropriate API
*
* @returns Environment information and API recommendations
*
* @example
* ```typescript
* import { getEnvironment } from 'pdq-wasm/browser';
*
* const env = getEnvironment();
* console.log(`Running in: ${env.type}`);
* console.log(`Use: ${env.recommendedAPI}`);
*
* if (env.supportsBlob) {
* const hash = await generateHashFromBlob(file);
* } else if (env.supportsDataUrl) {
* const hash = await generateHashFromDataUrl(dataUrl);
* }
* ```
*/
export function getEnvironment() {
    // Probe the DOM globals once; `typeof` is safe even for undeclared names.
    const hasWindow = typeof window !== 'undefined';
    const hasDocument = typeof document !== 'undefined';
    // Capability flags are computed independently of the runtime classification.
    const supportsBlob = typeof createImageBitmap !== 'undefined' &&
        typeof OffscreenCanvas !== 'undefined';
    const supportsDataUrl = hasWindow && hasDocument && typeof Image !== 'undefined';
    // Assembles the result object for whichever classification wins below.
    const describe = (type, recommendedAPI) => ({
        type,
        supportsDataUrl,
        supportsBlob,
        recommendedAPI,
    });
    // Web Worker: `self` and `importScripts` exist, `window` does not.
    const inWorker = typeof self !== 'undefined' &&
        // @ts-ignore - importScripts only exists in workers
        typeof importScripts === 'function' &&
        !hasWindow;
    if (inWorker) {
        return describe('worker', supportsBlob ? 'generateHashFromBlob()' : 'Not supported - upgrade browser');
    }
    // Browser main thread: both `window` and `document` are present.
    if (hasWindow && hasDocument) {
        // The blob API is preferred when available; the data-URL API is the fallback.
        return describe('browser', supportsBlob ? 'generateHashFromBlob()' : 'generateHashFromDataUrl()');
    }
    // Node.js: identified through `process.versions.node`.
    const inNode = typeof process !== 'undefined' &&
        process.versions != null &&
        process.versions.node != null;
    if (inNode) {
        return describe('node', 'PDQ.hash() with image buffer');
    }
    return describe('unknown', 'Unknown environment');
}
/**
* Simple LRU (Least Recently Used) cache implementation
* @internal
*/
class LRUCache {
    /**
     * @param maxSize - Number of entries allowed before the oldest one is evicted
     */
    constructor(maxSize) {
        // Map iteration order doubles as the recency order: first = oldest.
        this.cache = new Map();
        this.maxSize = maxSize;
    }
    /**
     * Look up a key and, when it holds a defined value, mark it most recently used.
     */
    get(key) {
        const stored = this.cache.get(key);
        if (stored === undefined) {
            return stored;
        }
        // Remove and re-insert so the entry moves to the newest position.
        this.cache.delete(key);
        this.cache.set(key, stored);
        return stored;
    }
    /**
     * Insert or overwrite an entry as most recently used, evicting the oldest
     * entry when the cache has grown past `maxSize`.
     */
    set(key, value) {
        if (this.cache.has(key)) {
            // Drop the old slot so the re-insert lands at the newest position.
            this.cache.delete(key);
        }
        this.cache.set(key, value);
        if (this.cache.size > this.maxSize) {
            const [oldestKey] = this.cache.keys();
            if (oldestKey !== undefined) {
                this.cache.delete(oldestKey);
            }
        }
    }
    /** Report whether a key is stored (does not change recency). */
    has(key) {
        return this.cache.has(key);
    }
    /** Remove a key; returns whether an entry was actually removed. */
    delete(key) {
        return this.cache.delete(key);
    }
    /** Remove every entry. */
    clear() {
        this.cache.clear();
    }
    /** Current number of stored entries. */
    get size() {
        return this.cache.size;
    }
}
/**
* Creates a hash existence checker with a custom lookup function
* Supports fluent API for validation and caching behavior
*
* @param lookup - Function that checks if a hash exists in your storage system
* @returns A hash checker function with chainable modifiers
*
* @example
* ```typescript
* // Simple checker - throws on invalid hash
* const checkHash = createHashChecker(async (hash) => {
* const { data } = await supabase.rpc('check_hash_exists', { p_hash: hash });
* return data;
* });
*
* const result = await checkHash(myHash);
* ```
*
* @example
* ```typescript
* // With REST API
* const checkHash = createHashChecker(async (hash) => {
* const response = await fetch(`/api/hashes/${hash}`);
* return response.json();
* });
* ```
*
* @example
* ```typescript
* // Gracefully ignore invalid hashes
* const checkHash = createHashChecker(lookup).ignoreInvalid();
*
* // Invalid hash returns { exists: false } instead of throwing
* const result = await checkHash('invalid-hash');
* ```
*
* @example
* ```typescript
* // Cached with TTL
* const checkHash = createHashChecker(lookup).cached(5 * 60 * 1000);
*
* // First call hits the database
* await checkHash(hash1);
*
* // Second call within 5 minutes uses cached result
* await checkHash(hash1);
* ```
*
* @example
* ```typescript
* // Combined - ignore invalid + cached
* const checkHash = createHashChecker(lookup)
* .ignoreInvalid()
* .cached(60 * 60 * 1000); // 1 hour cache
*
* // Clear cache when needed
* checkHash.clearCache?.();
* ```
*/
export function createHashChecker(lookup) {
    // Capacity shared by the initial cache instance and the recorded max size.
    const defaultCapacity = 1000;
    // Baseline behaviour: strict validation and caching switched off.
    const baseOptions = {
        ignoreInvalid: false,
        cached: false,
        cache: new LRUCache(defaultCapacity),
        cacheTTL: Infinity,
        cacheMaxSize: defaultCapacity,
    };
    return createCheckerWithOptions(lookup, baseOptions);
}
/**
* Internal function to create checker with specific options
* @internal
*/
function createCheckerWithOptions(lookup, options) {
    // A PDQ hash in hex form: exactly 64 hexadecimal characters, any case.
    const hexHashPattern = /^[0-9a-f]{64}$/i;
    const hashChecker = async (hash) => {
        const wellFormed = typeof hash === 'string' &&
            hash.length === 64 &&
            hexHashPattern.test(hash);
        if (!wellFormed) {
            if (!options.ignoreInvalid) {
                throw new Error('Invalid PDQ hash: must be 64 hexadecimal characters');
            }
            // Lenient mode: report "not found" instead of failing.
            return { exists: false, existing: null };
        }
        // Lowercase so differently-cased spellings share one cache entry.
        const cacheKey = hash.toLowerCase();
        if (options.cached && options.cache.has(cacheKey)) {
            const entry = options.cache.get(cacheKey);
            if (Date.now() - entry.timestamp < options.cacheTTL) {
                return entry.result;
            }
            // Entry outlived its TTL: drop it and fall through to a fresh lookup.
            options.cache.delete(cacheKey);
        }
        const result = await lookup(cacheKey);
        if (options.cached) {
            options.cache.set(cacheKey, { result, timestamp: Date.now() });
        }
        return result;
    };
    // Chainable modifiers: each returns a new checker built from adjusted options.
    Object.assign(hashChecker, {
        ignoreInvalid: () => createCheckerWithOptions(lookup, {
            ...options,
            ignoreInvalid: true
        }),
        cached: (ttl = Infinity, maxSize = 1000) => createCheckerWithOptions(lookup, {
            ...options,
            cached: true,
            cacheTTL: ttl,
            cache: new LRUCache(maxSize),
            cacheMaxSize: maxSize
        }),
    });
    // Only checkers that actually cache expose a way to empty the cache.
    if (options.cached) {
        hashChecker.clearCache = () => {
            options.cache.clear();
        };
    }
    return hashChecker;
}
/**
* Calculate Hamming distance between two PDQ hash strings (hex format)
* Convenience wrapper around PDQ.hammingDistance that works with hex strings
*
* @param hash1 - First PDQ hash (64 hex characters)
* @param hash2 - Second PDQ hash (64 hex characters)
* @returns Hamming distance (0-256, where 0 = identical, 256 = completely different)
*
* @throws Error if either hash is not 64 hex characters
*
* @example
* ```typescript
* const hash1 = 'a1b2c3d4...'; // 64 hex chars
* const hash2 = 'e5f6a7b8...'; // 64 hex chars
*
* const distance = hammingDistance(hash1, hash2);
* console.log(`Distance: ${distance} bits`);
*
* if (distance <= 31) {
* console.log('Images are likely duplicates');
* }
* ```
*/
export function hammingDistance(hash1, hash2) {
    // `?.` keeps null/undefined arguments on the documented validation path
    // (an Error with a clear message) instead of crashing with a TypeError
    // while reading `.length`.
    if (hash1?.length !== 64 || hash2?.length !== 64) {
        throw new Error('PDQ hashes must be exactly 64 hex characters');
    }
    // Single pattern shared by both checks; no `g` flag, so reuse is stateless.
    const hexPattern = /^[0-9a-f]{64}$/i;
    if (!hexPattern.test(hash1) || !hexPattern.test(hash2)) {
        throw new Error('PDQ hashes must contain only hexadecimal characters');
    }
    // Convert each hex string with PDQ.fromHex, then delegate the comparison
    // to PDQ.hammingDistance.
    const arr1 = PDQ.fromHex(hash1);
    const arr2 = PDQ.fromHex(hash2);
    return PDQ.hammingDistance(arr1, arr2);
}
/**
* Generate PDQ hash from a Blob or File in a worker-compatible way
*
* **✅ RECOMMENDED** - Works in both browser main thread and Web Workers
*
* Uses modern browser APIs (createImageBitmap + OffscreenCanvas) that work across contexts.
* This is the **preferred API** for most use cases, especially if you need Web Worker support.
*
* **For Workers:** This is the ONLY API that works - {@link generateHashFromDataUrl} will fail.
*
* **Browser Support:**
* - Chrome 69+ (full support)
* - Firefox 105+ (OffscreenCanvas added in 105)
* - Safari 16.4+ (OffscreenCanvas support)
* - Edge 79+
*
* **For older browsers** (main thread only), use {@link generateHashFromDataUrl} as fallback.
*
* @param blob Image blob or file
* @returns Hex-encoded PDQ hash (64 character hex string)
* @throws {Error} If createImageBitmap or OffscreenCanvas unavailable
* @throws {Error} If image fails to decode or has invalid dimensions
* @throws {Error} If image exceeds maximum dimension limit (10,000px)
*
* @see {@link generateHashFromDataUrl} for legacy browser fallback (main thread only)
* @see {@link getEnvironment} to detect runtime environment and choose the right API
*
* @example
* ```typescript
* // ✅ In a Web Worker (PREFERRED)
* self.onmessage = async (event) => {
* const file = event.data.file;
* const hash = await generateHashFromBlob(file);
* self.postMessage({ hash });
* };
* ```
*
* @example
* ```typescript
* // ✅ In a browser main thread (also works)
* const fileInput = document.querySelector('input[type="file"]');
* const file = fileInput.files[0];
* const hash = await generateHashFromBlob(file);
* console.log('Hash:', hash);
* ```
*
* @example
* ```typescript
* // ✅ With fetch API
* const response = await fetch('image.jpg');
* const blob = await response.blob();
* const hash = await generateHashFromBlob(blob);
* ```
*/
export async function generateHashFromBlob(blob) {
    // Both APIs below exist in browsers and workers; fail early with guidance otherwise.
    if (typeof createImageBitmap === 'undefined') {
        throw new Error('createImageBitmap is not available in this environment. ' +
            'For older browsers (main thread), use generateHashFromDataUrl() instead. ' +
            'Minimum browser versions: Chrome 69+, Firefox 105+, Safari 16.4+, Edge 79+.');
    }
    if (typeof OffscreenCanvas === 'undefined') {
        throw new Error('OffscreenCanvas is not available in this environment. ' +
            'For older browsers (main thread), use generateHashFromDataUrl() as a fallback. ' +
            'Minimum browser versions: Chrome 69+, Firefox 105+, Safari 16.4+, Edge 79+.');
    }
    // Decode the blob; wrap decoder failures in a friendlier message while
    // keeping the original error reachable through `cause`.
    let imageBitmap;
    try {
        imageBitmap = await createImageBitmap(blob);
    }
    catch (error) {
        throw new Error(`Failed to decode image: ${error instanceof Error ? error.message : 'Unknown error'}. ` +
            'The file may be corrupted or in an unsupported format.', { cause: error });
    }
    // Everything after a successful decode runs under one try/finally so the
    // bitmap is released exactly once, through the same guarded close() call,
    // on success and on every failure path.
    try {
        // Snapshot the dimensions up front: per the HTML spec a closed
        // ImageBitmap reports 0 for width/height, so error messages must not
        // read them after close().
        const { width, height } = imageBitmap;
        if (!width || !height) {
            throw new Error('Image has invalid dimensions (width or height is 0)');
        }
        // Size limit to prevent DOS attacks via huge images.
        const MAX_DIMENSION = 10000; // 10,000 pixels max on any side
        if (width > MAX_DIMENSION || height > MAX_DIMENSION) {
            throw new Error(`Image too large: ${width}x${height} pixels. ` +
                `Maximum allowed: ${MAX_DIMENSION}x${MAX_DIMENSION}`);
        }
        // Draw onto an OffscreenCanvas to read the pixels back.
        // NOTE: For performance optimization in high-throughput scenarios, consider
        // caching and reusing a single OffscreenCanvas and 2D context. However, this
        // would require careful size management and thread safety considerations in workers.
        const canvas = new OffscreenCanvas(width, height);
        const ctx = canvas.getContext('2d');
        if (!ctx) {
            throw new Error('Could not get 2D context from OffscreenCanvas');
        }
        ctx.drawImage(imageBitmap, 0, 0);
        // Canvas pixels come back as RGBA.
        const rgba = ctx.getImageData(0, 0, width, height).data;
        // Convert RGBA to RGB (PDQ only supports RGB or grayscale); alpha is dropped.
        const rgbData = new Uint8Array(width * height * 3);
        for (let i = 0, j = 0; i < rgba.length; i += 4, j += 3) {
            rgbData[j] = rgba[i]; // R
            rgbData[j + 1] = rgba[i + 1]; // G
            rgbData[j + 2] = rgba[i + 2]; // B
        }
        const result = PDQ.hash({
            data: rgbData,
            width,
            height,
            channels: 3 // RGB
        });
        return PDQ.toHex(result.hash);
    }
    finally {
        // Release the bitmap (check if close() exists for compatibility).
        if (typeof imageBitmap.close === 'function') {
            imageBitmap.close();
        }
    }
}
/**
* Generate PDQ perceptual hash from an image data URL or blob URL
*
* **⚠️ BROWSER MAIN THREAD ONLY** - Requires DOM APIs (Image, Canvas, document)
*
* **For Web Workers:** Use {@link generateHashFromBlob} instead, which works in both
* browsers and workers using modern APIs (createImageBitmap + OffscreenCanvas).
*
* **Migration Guide:**
* ```typescript
* // ❌ DON'T use in workers (will throw error)
* const hash = await generateHashFromDataUrl(dataUrl);
*
* // ✅ DO use in workers
* const hash = await generateHashFromBlob(file); // file is Blob/File
* ```
*
* **Auto-cleanup:** Blob URLs can be automatically revoked after processing using the
* `autoRevoke` parameter to prevent memory leaks. Useful when you don't need the blob
* URL for preview display. Data URLs (data:image/...) are never revoked.
*
* @param dataUrl - Image data URL (data:image/...) or blob URL (blob:...)
* @param autoRevoke - Automatically revoke blob URLs after processing (default: false)
* @returns Promise resolving to 64-character hex hash string
*
* @throws Error if called in non-browser main thread environment (e.g., Web Worker, Node.js)
* @throws Error if image fails to load
* @throws Error if canvas context cannot be obtained
*
* @see {@link generateHashFromBlob} for worker-compatible alternative
* @see {@link getEnvironment} to detect runtime environment and choose the right API
*
* @example
* ```typescript
* // Auto-revoke blob URL (when you don't need it for display)
* const file = input.files[0];
* const blobUrl = URL.createObjectURL(file);
* const hash = await generateHashFromDataUrl(blobUrl, true);
* // Blob URL automatically revoked!
* ```
*
* @example
* ```typescript
* // Keep blob URL for preview (manual revocation required)
* const blobUrl = URL.createObjectURL(file);
* const hash = await generateHashFromDataUrl(blobUrl, false);
* // Display preview using blobUrl...
* // Later: URL.revokeObjectURL(blobUrl);
* ```
*
* @example
* ```typescript
* // From canvas (data URLs don't need revocation)
* const canvas = document.getElementById('myCanvas');
* const dataUrl = canvas.toDataURL('image/png');
* const hash = await generateHashFromDataUrl(dataUrl);
* ```
*/
export async function generateHashFromDataUrl(dataUrl, autoRevoke = false) {
    // This path needs Image/document, so it only runs where getEnvironment()
    // reports a browser main thread.
    const { type: runtime } = getEnvironment();
    if (runtime !== 'browser') {
        const guidance = runtime === 'worker'
            ? 'For Web Workers, use generateHashFromBlob() instead, which uses worker-compatible APIs (createImageBitmap + OffscreenCanvas). ' +
                'Example: const hash = await generateHashFromBlob(file);'
            : 'For Node.js, use the core PDQ API with image buffers. ';
        throw new Error('generateHashFromDataUrl() requires browser main thread (needs DOM APIs). ' +
            guidance +
            'See examples/worker/README.md for more details.');
    }
    // Only blob: URLs are ever revoked, and only when the caller opted in.
    const revokeWhenDone = dataUrl.startsWith('blob:') && autoRevoke;
    const releaseUrl = () => {
        if (revokeWhenDone) {
            URL.revokeObjectURL(dataUrl);
        }
    };
    // Image loading is callback-based, so adapt it to a promise here.
    return new Promise((resolve, reject) => {
        const image = new Image();
        image.crossOrigin = 'anonymous';
        image.onerror = () => {
            releaseUrl();
            reject(new Error('Failed to load image'));
        };
        image.onload = () => {
            try {
                const canvas = document.createElement('canvas');
                const ctx = canvas.getContext('2d');
                if (!ctx) {
                    // Handled by the catch below: release the URL, then reject.
                    throw new Error('Could not get canvas context');
                }
                // Canvas takes the image's own dimensions.
                const { width, height } = image;
                canvas.width = width;
                canvas.height = height;
                ctx.drawImage(image, 0, 0);
                // Canvas pixels come back as RGBA.
                const rgba = ctx.getImageData(0, 0, width, height).data;
                // Convert RGBA to RGB (PDQ only supports RGB or grayscale); alpha is dropped.
                const rgbData = new Uint8Array(width * height * 3);
                for (let src = 0, dst = 0; src < rgba.length; src += 4, dst += 3) {
                    rgbData[dst] = rgba[src]; // R
                    rgbData[dst + 1] = rgba[src + 1]; // G
                    rgbData[dst + 2] = rgba[src + 2]; // B
                }
                const { hash } = PDQ.hash({
                    data: rgbData,
                    width,
                    height,
                    channels: 3 // RGB
                });
                const hexHash = PDQ.toHex(hash);
                // Release before settling so the URL is gone once the caller resumes.
                releaseUrl();
                resolve(hexHash);
            }
            catch (error) {
                releaseUrl();
                reject(error);
            }
        };
        // Assign the source last so both handlers are already attached.
        image.src = dataUrl;
    });
}
/**
* Detect duplicate images by comparing PDQ perceptual hashes
* Generates hashes for all images and finds groups of similar images
*
* @param files - Array of files with preview URLs
* @param threshold - Hamming distance threshold for duplicates (default: 31, PDQ recommended)
* @param onProgress - Optional callback for progress updates
* @returns Promise resolving to array of duplicate groups
*
* @example
* ```typescript
* const files = [
* { id: '1', name: 'photo1.jpg', preview: 'blob:...', type: 'image/jpeg' },
* { id: '2', name: 'photo2.jpg', preview: 'blob:...', type: 'image/jpeg' },
* ];
*
* const duplicates = await detectDuplicatesByHash(files);
*
* duplicates.forEach(group => {
* console.log('Duplicate group:');
* group.forEach(file => console.log(` - ${file.name}`));
* });
* ```
*
* @example
* ```typescript
* // With progress callback
* const duplicates = await detectDuplicatesByHash(
* files,
* 31,
* (progress) => {
* console.log(`${progress.processedFiles}/${progress.totalFiles} processed`);
* console.log(`Currently processing: ${progress.currentFile}`);
* console.log(`Duplicates found: ${progress.duplicatesFound}`);
* }
* );
* ```
*
* @example
* ```typescript
* // Custom threshold (more strict)
* const duplicates = await detectDuplicatesByHash(files, 15);
* ```
*/
export async function detectDuplicatesByHash(files, threshold = 31, onProgress) {
    // Keep only entries that declare an image MIME type and carry a preview URL.
    // The typeof guard skips entries with a missing/non-string `type` instead of
    // throwing on `.startsWith`.
    const imageFiles = files.filter((file) => typeof file?.type === 'string' &&
        file.type.startsWith('image/') &&
        file.preview);
    if (imageFiles.length < 2) {
        // Fewer than two candidates: nothing to compare.
        return [];
    }
    const totalFiles = imageFiles.length;
    let processedFiles = 0;
    let duplicatesFound = 0;
    // Hash every preview. A failed hash is recorded on meta.hashError rather
    // than aborting the whole batch; such files are never grouped.
    const filesWithHashes = await Promise.all(imageFiles.map(async (file) => {
        const newMeta = { hash: null };
        try {
            newMeta.hash = await generateHashFromDataUrl(file.preview);
        }
        catch (error) {
            newMeta.hashError = error instanceof Error ? error.message : String(error);
        }
        processedFiles++;
        // One progress update per file, emitted once its hashing attempt settles.
        if (onProgress) {
            onProgress({
                totalFiles,
                processedFiles,
                currentFile: file.name,
                duplicatesFound
            });
        }
        return {
            ...file,
            meta: {
                ...file.meta,
                ...newMeta
            }
        };
    }));
    // Greedy grouping: each not-yet-grouped file collects every later file
    // within `threshold` bits of it.
    const duplicateGroups = [];
    // Track grouped files by array position, not by `file.id`: ids may be
    // missing or repeated, which would make unrelated files look "already
    // grouped" and silently drop later duplicate groups.
    const groupedIndexes = new Set();
    for (let i = 0; i < filesWithHashes.length; i++) {
        const file1 = filesWithHashes[i];
        if (!file1.meta.hash || groupedIndexes.has(i)) {
            continue;
        }
        const group = [file1];
        for (let j = i + 1; j < filesWithHashes.length; j++) {
            const file2 = filesWithHashes[j];
            if (!file2.meta.hash || groupedIndexes.has(j)) {
                continue;
            }
            // Calculate PDQ hamming distance (synchronous - no I/O)
            const distance = hammingDistance(file1.meta.hash, file2.meta.hash);
            if (distance <= threshold) {
                group.push(file2);
                groupedIndexes.add(j);
            }
        }
        // Only groups with at least one duplicate are reported.
        if (group.length > 1) {
            duplicateGroups.push(group);
            duplicatesFound += group.length;
            // Report updated duplicate count
            if (onProgress) {
                onProgress({
                    totalFiles,
                    processedFiles: totalFiles,
                    currentFile: '',
                    duplicatesFound
                });
            }
        }
    }
    // Final progress report
    if (onProgress) {
        onProgress({
            totalFiles,
            processedFiles: totalFiles,
            currentFile: '',
            duplicatesFound
        });
    }
    return duplicateGroups;
}