UNPKG

greed.js

Version:

Lightweight, private alternative to Colab. Run PyTorch & NumPy in browser with GPU acceleration (8.8x speedup). Fast, secure, runs locally.

github.com/adityakhalkar/greed

adityakhalkar/greed

1,157 lines (977 loc) • 322 kB

JavaScript

/** * PyTorch Runtime Polyfill - Extracted from main thread for better performance * Provides PyTorch-compatible API with WebGPU acceleration support */ /** * Initialize PyTorch polyfill in Python runtime */ export function createPyTorchPolyfill() { return ` # WebGPU-enabled PyTorch polyfill setup import numpy as np import sys # Global gradient tracking state _grad_enabled = True def is_grad_enabled(): """Check if gradient computation is currently enabled""" global _grad_enabled return _grad_enabled def set_grad_enabled(mode): """Enable or disable gradient computation globally""" global _grad_enabled prev = _grad_enabled _grad_enabled = mode return prev class WebGPUDevice: def __init__(self, device_type, **kwargs): self.type = device_type def __str__(self): return self.type def __repr__(self): return f"device(type='{self.type}')" class WebGPUTensor: def __init__(self, data, device='cpu', dtype='float32', requires_grad=False, _force_webgpu=False, _internal=False, **kwargs): if isinstance(data, (list, tuple)): self._data = np.array(data, dtype=dtype) elif isinstance(data, np.ndarray): self._data = data.astype(dtype) else: self._data = np.array(data, dtype=dtype) # Determine actual device based on tensor size and WebGPU availability self._original_device = device self._force_webgpu = _force_webgpu # WebGPU auto-detection with recursion prevention # Only auto-detect for user-facing tensor creation (not internal operations) if device == 'webgpu' or _internal: # Explicitly requested webgpu or internal operation - use as-is self.device = device if isinstance(device, str) else device elif _force_webgpu or (device in ['cuda', 'gpu']): # Map CUDA/GPU requests to WebGPU self.device = 'webgpu' elif device == 'cpu': # Explicitly requested CPU - respect that self.device = device else: # Auto-detect for user-facing tensor creation if self._should_use_webgpu(self._data): self.device = 'webgpu' else: self.device = device if isinstance(device, str) else device self.dtype = dtype # Only enable gradient tracking if globally enabled and explicitly requested self.requires_grad = requires_grad and is_grad_enabled() self.shape = self._data.shape self.ndim = self._data.ndim self.grad = None self.grad_fn = None # GPU ACCELERATION: Allocate GPU buffer if on webgpu device self._gpu_buffer_id = None # Debug: Check each condition device_str = str(self.device) print(f"[GPU DEBUG] device={self.device}, device_str={device_str}, is_webgpu={device_str == 'webgpu'}, has_allocate={'__webgpu_allocate__' in globals()}, _internal={_internal}") if device_str == 'webgpu' and '__webgpu_allocate__' in globals() and not _internal: try: # Allocate buffer on GPU for faster operations print(f"[GPU] Calling __webgpu_allocate__ with shape={list(self.shape)}, dtype={self.dtype}") self._gpu_buffer_id = __webgpu_allocate__( self._data.tolist(), # Convert numpy to Python list for JS list(self.shape), self.dtype ) print(f"[GPU] Allocated buffer ID: {self._gpu_buffer_id}") except Exception as e: # Log error and fallback to CPU if GPU allocation fails print(f"[GPU] Allocation failed: {str(e)}") import traceback traceback.print_exc() @property def data(self): """Return data wrapped with PyTorch-like methods""" return TensorDataWrapper(self._data, self) @property def is_cpu(self): """Check if tensor is on CPU""" return self.device == 'cpu' @property def is_cuda(self): """Check if tensor is on CUDA (WebGPU in our case)""" return self.device in ['cuda', 'webgpu', 'gpu'] @property def T(self): """Transpose property - returns transposed view for 2D tensors""" if self.ndim == 2: return self.transpose(0, 1) elif self.ndim == 1: # For 1D tensors, T returns the tensor unchanged (like PyTorch) return self else: raise RuntimeError(f"T property expects a 1D or 2D tensor, but got {self.ndim}D") def is_contiguous(self): """Check if tensor is contiguous in memory. In our simplified implementation, tensors are always contiguous.""" return True def contiguous(self): """Return a contiguous tensor. In our implementation, tensors are always contiguous, so return self.""" return self def t(self): """Transpose 2D tensor (shorthand for transpose(0, 1))""" if self.ndim != 2: raise RuntimeError(f"t() expects a 2D tensor, but got {self.ndim}D") return self.transpose(0, 1) def size(self, dim=None): """Return the size of the tensor or a specific dimension""" if dim is None: return self.shape else: if dim < 0: dim = self.ndim + dim if dim >= self.ndim or dim < 0: raise IndexError(f"Dimension out of range (expected to be in range of [{-self.ndim}, {self.ndim-1}], but got {dim})") return self.shape[dim] def numel(self): """Return the total number of elements in the tensor""" return self._data.size def dim(self): """Return the number of dimensions of the tensor (method call)""" return self.ndim # For compatibility: some code might access .dim without calling it # We already have the dim() method above, but this helps with edge cases # In real PyTorch, dim is ONLY a method, never an attribute def _should_use_webgpu(self, data): \"\"\"Determine if WebGPU should be used based on tensor characteristics\"\"\" try: # Use WebGPU for tensors with more than 1000 elements for optimal performance # Smaller tensors are faster on CPU due to GPU overhead if hasattr(data, 'size'): return data.size >= 1000 elif hasattr(data, '__len__'): # For nested structures, estimate total size total_size = 1 def estimate_size(obj): if hasattr(obj, '__len__'): return len(obj) * estimate_size(obj[0] if len(obj) > 0 else 1) return 1 return estimate_size(data) >= 1000 return False except: return False def _sync_from_gpu(self): """Sync GPU data to CPU when needed (lazy sync)""" if hasattr(self, '_gpu_only') and self._gpu_only and self._gpu_buffer_id is not None: try: if '__webgpu_read__' in globals(): # Read from GPU result_data_flat = __webgpu_read__(self._gpu_buffer_id) self._data = result_data_flat.reshape(self.shape) self._gpu_only = False # Data now synced except Exception as e: print(f"[GPU] Sync from GPU failed: {str(e)}") def numpy(self): self._sync_from_gpu() # Sync if GPU-only return self._data def tolist(self): self._sync_from_gpu() # Sync if GPU-only return self._data.tolist() def __str__(self): """String representation of tensor""" self._sync_from_gpu() # Sync if GPU-only result = f"tensor({self._data.tolist()}, requires_grad={self.requires_grad})" return result def __repr__(self): """Detailed representation of tensor""" return f"WebGPUTensor({self._data}, device='{self.device}', requires_grad={self.requires_grad}, _internal=True)" def item(self): """Return the value of this tensor as a standard Python number""" if self._data.size == 1: value = self._data.item() # Ensure we return proper Python types that can be used as indices if self.dtype in ['int32', 'int64', 'long']: return int(value) elif self.dtype in ['float32', 'float64', 'double']: return float(value) else: # For other types, try to convert appropriately if isinstance(value, (int, np.integer)): return int(value) elif isinstance(value, (float, np.floating)): return float(value) else: return value else: raise ValueError("only one element tensors can be converted to Python scalars") def __format__(self, format_spec): """Support for f-string formatting""" if self._data.size == 1: return format(self._data.item(), format_spec) else: return format(str(self), format_spec) def view(self, *shape): """Reshape tensor maintaining data""" if len(shape) == 1 and isinstance(shape[0], (list, tuple)): shape = shape[0] # Handle -1 for automatic size calculation if -1 in shape: total_size = self._data.size known_size = 1 unknown_idx = -1 for i, s in enumerate(shape): if s == -1: unknown_idx = i else: known_size *= s if unknown_idx != -1: shape = list(shape) shape[unknown_idx] = total_size // known_size shape = tuple(shape) reshaped_data = self._data.reshape(shape) return WebGPUTensor(reshaped_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad, _internal=True) def reshape(self, *shape): return self.view(*shape) def transpose(self, dim0, dim1): transposed_data = np.swapaxes(self._data, dim0, dim1) return WebGPUTensor(transposed_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad, _internal=True) def t(self): """Transpose a 2D tensor (shorthand for transpose(0, 1))""" if self.ndim != 2: raise RuntimeError(f"t() expects a 2D tensor, but got {self.ndim}D tensor") return self.transpose(0, 1) def unsqueeze(self, dim): """Add a dimension of size 1""" new_shape = list(self._data.shape) if dim < 0: dim = len(new_shape) + dim + 1 new_shape.insert(dim, 1) reshaped_data = self._data.reshape(new_shape) return WebGPUTensor(reshaped_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad, _internal=True) def flatten(self, start_dim=0, end_dim=-1): """Flatten tensor dimensions""" if end_dim == -1: end_dim = self._data.ndim - 1 shape = list(self._data.shape) flattened_size = 1 for i in range(start_dim, end_dim + 1): flattened_size *= shape[i] new_shape = shape[:start_dim] + [flattened_size] + shape[end_dim + 1:] flattened_data = self._data.reshape(new_shape) return WebGPUTensor(flattened_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad, _internal=True) def squeeze(self, dim=None): """Remove dimensions of size 1""" if dim is None: # Remove all dimensions of size 1 squeezed_data = np.squeeze(self._data) else: # Remove specific dimension if it has size 1 if dim < 0: dim = self._data.ndim + dim if self._data.shape[dim] != 1: return self # No change if dimension is not size 1 squeezed_data = np.squeeze(self._data, axis=dim) return WebGPUTensor(squeezed_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad, _internal=True) def clone(self): """Create a copy of the tensor with gradient tracking preserved. The cloned tensor shares no storage with the original tensor but preserves requires_grad and creates a new node in the computation graph if applicable. """ cloned_data = self._data.copy() cloned_tensor = WebGPUTensor( cloned_data, device=self.device, dtype=self.dtype, requires_grad=self.requires_grad ) # If original tensor has gradient tracking, set up backward function for clone if self.requires_grad: def clone_backward(grad_output): # Gradient flows back to original tensor unchanged if self.grad is None: self.grad = grad_output else: self.grad._data += grad_output._data cloned_tensor._backward_fn = clone_backward cloned_tensor._inputs = [self] return cloned_tensor def detach(self): """Create a copy of the tensor that is detached from the computation graph. The detached tensor will never require gradient and breaks the gradient flow. Returns a new tensor with the same data but requires_grad=False. """ detached_data = self._data.copy() detached_tensor = WebGPUTensor( detached_data, device=self.device, dtype=self.dtype, requires_grad=False # Always False for detached tensors , _internal=True) return detached_tensor def sum(self, dim=None, keepdim=False): if dim is None: result_data = np.sum(self._data) else: result_data = np.sum(self._data, axis=dim, keepdims=keepdim) result = WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad) # Set up autograd for sum if result.requires_grad: result.grad_fn = 'SumBackward' result._inputs = [self] def sum_backward(grad): if self.requires_grad: if self.grad is None: self.grad = WebGPUTensor(np.zeros_like(self._data), device=self.device, dtype=self.dtype, _internal=True) # Gradient of sum: broadcast the gradient back to input shape if dim is None: # Sum over all dimensions - broadcast gradient to all elements self.grad._data += grad._data * np.ones_like(self._data) else: # Sum over specific dimension - broadcast along that dimension grad_data = grad._data if hasattr(grad, '_data') else grad if not keepdim: # Need to add the dimension back for broadcasting grad_data = np.expand_dims(grad_data, axis=dim) self.grad._data += np.broadcast_to(grad_data, self._data.shape) result._backward_fn = sum_backward return result def mean(self, dim=None, keepdim=False): if dim is None: result_data = np.mean(self._data) else: result_data = np.mean(self._data, axis=dim, keepdims=keepdim) result = WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad) # Set up autograd for mean if result.requires_grad: result.grad_fn = 'MeanBackward' result._inputs = [self] def mean_backward(grad): if self.requires_grad: if self.grad is None: self.grad = WebGPUTensor(np.zeros_like(self._data), device=self.device, dtype=self.dtype, _internal=True) # Gradient of mean: broadcast and divide by number of elements if dim is None: # Mean over all dimensions n = self._data.size self.grad._data += (grad._data / n) * np.ones_like(self._data) else: # Mean over specific dimension grad_data = grad._data if hasattr(grad, '_data') else grad if not keepdim: grad_data = np.expand_dims(grad_data, axis=dim) n = self._data.shape[dim] self.grad._data += np.broadcast_to(grad_data / n, self._data.shape) result._backward_fn = mean_backward return result def std(self, dim=None, keepdim=False, unbiased=True): """Compute standard deviation""" if dim is None: result_data = np.std(self._data, ddof=1 if unbiased else 0) else: result_data = np.std(self._data, axis=dim, keepdims=keepdim, ddof=1 if unbiased else 0) return WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad, _internal=True) def var(self, dim=None, keepdim=False, unbiased=True): """Compute variance""" if dim is None: result_data = np.var(self._data, ddof=1 if unbiased else 0) else: result_data = np.var(self._data, axis=dim, keepdims=keepdim, ddof=1 if unbiased else 0) return WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad, _internal=True) def tanh(self): """Hyperbolic tangent activation - tensor method""" result_data = np.tanh(self._data) return WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad, _internal=True) def abs(self): """Absolute value - tensor method""" result_data = np.abs(self._data) return WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad, _internal=True) def all(self): """Test if all elements are True""" result_data = np.all(self._data) return result_data def max(self, dim=None, keepdim=False): """Maximum values along a dimension""" if dim is None: result_data = np.max(self._data) return WebGPUTensor([result_data], device="webgpu", dtype=self.dtype, _internal=True) else: max_values = np.max(self._data, axis=dim, keepdims=keepdim) max_indices = np.argmax(self._data, axis=dim) if keepdim: max_indices = np.expand_dims(max_indices, axis=dim) values_tensor = WebGPUTensor(max_values, device="webgpu", dtype=self.dtype, _internal=True) indices_tensor = WebGPUTensor(max_indices, device="webgpu", dtype='int64', _internal=True) return values_tensor, indices_tensor def min(self, dim=None, keepdim=False): """Minimum values along a dimension""" if dim is None: result_data = np.min(self._data) return WebGPUTensor([result_data], device="webgpu", dtype=self.dtype, _internal=True) else: min_values = np.min(self._data, axis=dim, keepdims=keepdim) min_indices = np.argmin(self._data, axis=dim) if keepdim: min_indices = np.expand_dims(min_indices, axis=dim) values_tensor = WebGPUTensor(min_values, device="webgpu", dtype=self.dtype, _internal=True) indices_tensor = WebGPUTensor(min_indices, device="webgpu", dtype='int64', _internal=True) return values_tensor, indices_tensor def argmax(self, dim=None, keepdim=False): """Indices of maximum values along a dimension""" if dim is None: result_data = np.argmax(self._data) return WebGPUTensor([result_data], device="webgpu", dtype='int64', _internal=True) else: result_data = np.argmax(self._data, axis=dim) if keepdim: result_data = np.expand_dims(result_data, axis=dim) return WebGPUTensor(result_data, device="webgpu", dtype='int64', _internal=True) def argmin(self, dim=None, keepdim=False): """Indices of minimum values along a dimension""" if dim is None: result_data = np.argmin(self._data) return WebGPUTensor([result_data], device="webgpu", dtype='int64', _internal=True) else: result_data = np.argmin(self._data, axis=dim) if keepdim: result_data = np.expand_dims(result_data, axis=dim) return WebGPUTensor(result_data, device="webgpu", dtype='int64', _internal=True) def to(self, device): new_device = WebGPUDevice(device) if isinstance(device, str) else device # Don't use _internal=True to allow GPU buffer allocation when moving to GPU return WebGPUTensor(self._data.copy(), device=new_device, dtype=self.dtype, requires_grad=self.requires_grad) def cpu(self): return self.to('cpu') def cuda(self): return self.to('webgpu') # Map CUDA to WebGPU def float(self): """Convert tensor to float32 dtype""" return WebGPUTensor(self._data.copy(), device=self.device, dtype='float32', requires_grad=self.requires_grad, _internal=True) def double(self): """Convert tensor to float64 dtype""" return WebGPUTensor(self._data.copy(), device=self.device, dtype='float64', requires_grad=self.requires_grad, _internal=True) def int(self): """Convert tensor to int32 dtype""" return WebGPUTensor(self._data.copy(), device=self.device, dtype='int32', requires_grad=self.requires_grad, _internal=True) def long(self): """Convert tensor to int64 dtype""" return WebGPUTensor(self._data.copy(), device=self.device, dtype='int64', requires_grad=self.requires_grad, _internal=True) def type_as(self, other): """Convert this tensor to the same dtype as other tensor""" if isinstance(other, WebGPUTensor): target_dtype = other.dtype else: # If other is not a tensor, assume it's float32 target_dtype = 'float32' return WebGPUTensor(self._data.copy(), device=self.device, dtype=target_dtype, requires_grad=self.requires_grad, _internal=True) def __getitem__(self, key): """Support tensor slicing like X[:, 0] and advanced indexing""" # Handle advanced indexing with tensor indices if isinstance(key, tuple): # Convert WebGPUTensor indices to numpy arrays converted_key = [] for k in key: if isinstance(k, WebGPUTensor): # Convert tensor to numpy array for indexing converted_key.append(k._data.astype(np.int64)) else: converted_key.append(k) key = tuple(converted_key) # Multi-dimensional indexing indexed_data = self._data.reshape(self.shape)[key] elif isinstance(key, WebGPUTensor): # Single tensor index indices = key._data.astype(np.int64) indexed_data = self._data.reshape(self.shape)[indices] else: # Single dimension indexing (slice, int, etc.) indexed_data = self._data.reshape(self.shape)[key] return WebGPUTensor(indexed_data, device=self.device, dtype=self.dtype, requires_grad=self.requires_grad, _internal=True) def backward(self, gradient=None, retain_graph=False, create_graph=False): """Backward propagation through the computation graph""" if not self.requires_grad: return # Check if graph still exists if not hasattr(self, '_backward_fn') or self._backward_fn is None: if hasattr(self, 'grad_fn') and self.grad_fn is not None: raise RuntimeError("Trying to backward through the graph a second time (or directly access a leaf Variable that doesn't require grad). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved variables after calling backward.") if gradient is None: if self._data.size != 1: raise RuntimeError("grad can be implicitly created only for scalar outputs") gradient = WebGPUTensor(np.ones_like(self._data), device=self.device, dtype=self.dtype, _internal=True) # Topological sort for DAG-based backward pass visited = set() topo_order = [] def build_topo(node): if id(node) in visited or not isinstance(node, WebGPUTensor): return visited.add(id(node)) if hasattr(node, '_inputs'): for inp in node._inputs: build_topo(inp) topo_order.append(node) build_topo(self) # Initialize gradient for the output if self.grad is None: self.grad = WebGPUTensor(np.zeros_like(self._data), device=self.device, dtype=self.dtype, _internal=True) # Handle gradient parameter properly if isinstance(gradient, WebGPUTensor): self.grad._data = gradient._data.copy() elif hasattr(gradient, '_data'): self.grad._data = gradient._data.copy() else: self.grad._data = np.array(gradient) # Backward pass in reverse topological order for node in reversed(topo_order): if hasattr(node, '_backward_fn') and node._backward_fn and node.grad is not None: # Call hooks on the gradient before propagating grad_to_propagate = node._call_hooks(node.grad) if hasattr(node, '_call_hooks') else node.grad # Pass create_graph flag to backward function if it accepts it try: node._backward_fn(grad_to_propagate, create_graph=create_graph) except TypeError: # Fallback for backward functions that don't accept create_graph node._backward_fn(grad_to_propagate) # Clean up graph if not retaining if not retain_graph: for node in topo_order: if hasattr(node, '_backward_fn'): node._backward_fn = None if hasattr(node, '_inputs'): node._inputs = [] def zero_(self): """Zero out the tensor data in-place""" self._data.fill(0) return self def retain_grad(self): """Enable gradient retention for non-leaf tensors""" self._retain_grad = True return self def register_hook(self, hook): """Register a backward hook on the tensor. The hook will be called every time a gradient with respect to the tensor is computed. The hook should have the following signature: hook(grad) -> Tensor or None Args: hook: A function that takes a gradient tensor and optionally returns a modified gradient Returns: A handle that can be used to remove the hook by calling handle.remove() """ if not hasattr(self, '_hooks'): self._hooks = [] # Store the hook self._hooks.append(hook) # Create a handle for removing the hook class HookHandle: def __init__(self, tensor, hook_fn, **kwargs): self.tensor = tensor self.hook_fn = hook_fn def remove(self): if hasattr(self.tensor, '_hooks') and self.hook_fn in self.tensor._hooks: self.tensor._hooks.remove(self.hook_fn) return HookHandle(self, hook) def _call_hooks(self, grad): """Call all registered hooks on the gradient""" if not hasattr(self, '_hooks') or not self._hooks: return grad for hook in self._hooks: new_grad = hook(grad) if new_grad is not None: grad = new_grad return grad def __repr__(self): return f"tensor({self._data}, device='{self.device}', dtype='{self.dtype}')" def __float__(self): """Convert single-element tensor to Python float""" if self._data.size == 1: return float(self._data.item()) else: raise TypeError(f"only single-element tensors can be converted to Python scalars") def __int__(self): """Convert single-element tensor to Python int""" if self._data.size == 1: return int(self._data.item()) else: raise TypeError(f"only single-element tensors can be converted to Python scalars") def __len__(self): """Return the length of the first dimension""" if self.ndim == 0: raise TypeError("len() of unsized object") return self.shape[0] def __getitem__(self, key): """Support tensor indexing like tensor[indices]""" if isinstance(key, WebGPUTensor): # Convert WebGPUTensor indices to numpy array indices = key._data.astype(int) result_data = self._data[indices] # In PyTorch, indexing with tensor indices always preserves at least 1 dimension # even when the index tensor has 1 element if result_data.ndim == 0: result_data = np.array([result_data]) else: result_data = self._data[key] return WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad, _internal=True) def __setitem__(self, key, value): """Support tensor item assignment like tensor[indices] = value""" if isinstance(value, WebGPUTensor): value_data = value._data else: value_data = value if isinstance(key, WebGPUTensor): # Convert WebGPUTensor indices to numpy array indices = key._data.astype(int) self._data[indices] = value_data else: self._data[key] = value_data def eq(self, other): """Element-wise equality comparison (returns tensor)""" if isinstance(other, WebGPUTensor): result_data = self._data == other._data else: result_data = self._data == other return WebGPUTensor(result_data, device="webgpu", dtype='bool', _internal=True) def __eq__(self, other): """Element-wise equality comparison (returns tensor like PyTorch)""" if isinstance(other, WebGPUTensor): result_data = self._data == other._data else: result_data = self._data == other return WebGPUTensor(result_data, device="webgpu", dtype='bool', _internal=True) def __ne__(self, other): """Element-wise not-equal comparison""" if isinstance(other, WebGPUTensor): result_data = self._data != other._data else: result_data = self._data != other return WebGPUTensor(result_data, device="webgpu", dtype='bool', _internal=True) def __gt__(self, other): """Element-wise greater than comparison""" if isinstance(other, WebGPUTensor): result_data = self._data > other._data else: result_data = self._data > other return WebGPUTensor(result_data, device="webgpu", dtype='bool', _internal=True) def __lt__(self, other): """Element-wise less than comparison""" if isinstance(other, WebGPUTensor): result_data = self._data < other._data else: result_data = self._data < other return WebGPUTensor(result_data, device="webgpu", dtype='bool', _internal=True) def __ge__(self, other): """Element-wise greater than or equal comparison""" if isinstance(other, WebGPUTensor): result_data = self._data >= other._data else: result_data = self._data >= other return WebGPUTensor(result_data, device="webgpu", dtype='bool', _internal=True) def __le__(self, other): """Element-wise less than or equal comparison""" if isinstance(other, WebGPUTensor): result_data = self._data <= other._data else: result_data = self._data <= other return WebGPUTensor(result_data, device="webgpu", dtype='bool', _internal=True) def __hash__(self): """Make tensor hashable for use as dictionary keys. Uses object identity (id) so each tensor instance is unique.""" return id(self) # Masked operations def masked_fill(self, mask, value): """Fill elements of self tensor with value where mask is True. Args: mask: Boolean tensor with same shape as self value: Value to fill Returns: New tensor with masked elements filled """ if isinstance(mask, WebGPUTensor): mask_data = mask._data else: mask_data = np.array(mask) # Ensure mask is boolean type mask_data = mask_data.astype(bool) # Create a copy of the data result_data = self._data.copy() # Fill masked positions result_data[mask_data] = value result = WebGPUTensor( result_data, device=self.device, dtype=self.dtype, requires_grad=self.requires_grad ) # Set up backward function if gradient tracking is enabled if self.requires_grad: def masked_fill_backward(grad_output): # Gradient flows through non-masked elements only grad_input = grad_output._data.copy() grad_input[mask_data] = 0 # Zero out gradients for masked positions grad_tensor = WebGPUTensor(grad_input, device="webgpu", dtype=self.dtype, _internal=True) if self.grad is None: self.grad = grad_tensor else: self.grad._data += grad_tensor._data result._backward_fn = masked_fill_backward result._inputs = [self] return result def masked_fill_(self, mask, value): """In-place version of masked_fill""" if isinstance(mask, WebGPUTensor): mask_data = mask._data else: mask_data = np.array(mask) # Ensure mask is boolean type mask_data = mask_data.astype(bool) self._data[mask_data] = value return self # Arithmetic operators def __add__(self, other): if isinstance(other, WebGPUTensor): result_data = self._data + other._data else: result_data = self._data + other result = WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad or (isinstance(other, WebGPUTensor) and other.requires_grad)) # Set up autograd if result.requires_grad: result.grad_fn = 'AddBackward' result._inputs = [] if self.requires_grad: result._inputs.append(self) if isinstance(other, WebGPUTensor) and other.requires_grad: result._inputs.append(other) def add_backward(grad, create_graph=False): if self.requires_grad: if self.grad is None: self.grad = WebGPUTensor(np.zeros_like(self._data), device=self.device, dtype=self.dtype, _internal=True) # Handle broadcasting: reduce gradient to match original shape grad_data = grad._data # Sum out added dims and reduce broadcast dims ndims_added = grad_data.ndim - self._data.ndim for i in range(ndims_added): grad_data = grad_data.sum(axis=0) # Reduce dimensions that were broadcast for i in range(grad_data.ndim): if self._data.shape[i] == 1 and grad_data.shape[i] > 1: grad_data = np.sum(grad_data, axis=i, keepdims=True) self.grad._data += grad_data if create_graph: self.grad.requires_grad = True self.grad.grad_fn = 'AddBackwardBackward' if isinstance(other, WebGPUTensor) and other.requires_grad: if other.grad is None: other.grad = WebGPUTensor(np.zeros_like(other._data), device=other.device, dtype=other.dtype, _internal=True) # Handle broadcasting for other grad_data = grad._data ndims_added = grad_data.ndim - other._data.ndim for i in range(ndims_added): grad_data = grad_data.sum(axis=0) for i in range(grad_data.ndim): if other._data.shape[i] == 1 and grad_data.shape[i] > 1: grad_data = np.sum(grad_data, axis=i, keepdims=True) other.grad._data += grad_data if create_graph: other.grad.requires_grad = True result._backward_fn = add_backward return result def __sub__(self, other): if isinstance(other, WebGPUTensor): result_data = self._data - other._data else: result_data = self._data - other result = WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad or (isinstance(other, WebGPUTensor) and other.requires_grad)) # Set up autograd if result.requires_grad: result.grad_fn = 'SubBackward' result._inputs = [] if self.requires_grad: result._inputs.append(self) if isinstance(other, WebGPUTensor) and other.requires_grad: result._inputs.append(other) def sub_backward(grad): if self.requires_grad: if self.grad is None: self.grad = WebGPUTensor(np.zeros_like(self._data), device=self.device, dtype=self.dtype, _internal=True) # Gradient w.r.t. self: grad (unchanged) but handle broadcasting grad_data = grad._data ndims_added = grad_data.ndim - self._data.ndim for i in range(ndims_added): grad_data = grad_data.sum(axis=0) for i in range(grad_data.ndim): if self._data.shape[i] == 1 and grad_data.shape[i] > 1: grad_data = np.sum(grad_data, axis=i, keepdims=True) self.grad._data += grad_data if isinstance(other, WebGPUTensor) and other.requires_grad: if other.grad is None: other.grad = WebGPUTensor(np.zeros_like(other._data), device=other.device, dtype=other.dtype, _internal=True) # Gradient w.r.t. other: -grad (negated) and handle broadcasting grad_data = grad._data ndims_added = grad_data.ndim - other._data.ndim for i in range(ndims_added): grad_data = grad_data.sum(axis=0) for i in range(grad_data.ndim): if other._data.shape[i] == 1 and grad_data.shape[i] > 1: grad_data = np.sum(grad_data, axis=i, keepdims=True) other.grad._data -= grad_data result._backward_fn = sub_backward return result def __mul__(self, other): if isinstance(other, WebGPUTensor): result_data = self._data * other._data else: result_data = self._data * other result = WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad or (isinstance(other, WebGPUTensor) and other.requires_grad)) # Set up autograd if result.requires_grad: result.grad_fn = 'MulBackward' result._inputs = [] if self.requires_grad: result._inputs.append(self) if isinstance(other, WebGPUTensor) and other.requires_grad: result._inputs.append(other) def mul_backward(grad, create_graph=False): if self.requires_grad: if self.grad is None: self.grad = WebGPUTensor(np.zeros_like(self._data), device=self.device, dtype=self.dtype, _internal=True) # Gradient: grad * other if create_graph: grad_tensor = grad if isinstance(grad, WebGPUTensor) else WebGPUTensor(grad._data, device=self.device, dtype=self.dtype, _internal=True) if isinstance(other, WebGPUTensor): other_tensor = WebGPUTensor(other._data, device=self.device, dtype=self.dtype, requires_grad=True, _internal=True) grad_self = grad_tensor * other_tensor else: grad_self = grad_tensor * other self.grad._data += grad_self._data self.grad.requires_grad = True self.grad.grad_fn = 'MulBackwardBackward' else: if isinstance(other, WebGPUTensor): self.grad._data += grad._data * other._data else: self.grad._data += grad._data * other if isinstance(other, WebGPUTensor) and other.requires_grad: if other.grad is None: other.grad = WebGPUTensor(np.zeros_like(other._data), device=other.device, dtype=other.dtype, _internal=True) # Gradient: grad * self if create_graph: grad_tensor = grad if isinstance(grad, WebGPUTensor) else WebGPUTensor(grad._data, device=self.device, dtype=self.dtype, _internal=True) self_tensor = WebGPUTensor(self._data, device=self.device, dtype=self.dtype, requires_grad=True, _internal=True) grad_other = grad_tensor * self_tensor other.grad._data += grad_other._data other.grad.requires_grad = True else: other.grad._data += grad._data * self._data result._backward_fn = mul_backward return result def __truediv__(self, other): if isinstance(other, WebGPUTensor): result_data = self._data / other._data else: result_data = self._data / other result = WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad or (isinstance(other, WebGPUTensor) and other.requires_grad)) # Set up autograd if result.requires_grad: result.grad_fn = 'DivBackward' result._inputs = [] if self.requires_grad: result._inputs.append(self) if isinstance(other, WebGPUTensor) and other.requires_grad: result._inputs.append(other) def div_backward(grad): if self.requires_grad: if self.grad is None: self.grad = WebGPUTensor(np.zeros_like(self._data), device=self.device, dtype=self.dtype, _internal=True) # Gradient w.r.t. self: grad / other if isinstance(other, WebGPUTensor): self.grad._data += grad._data / other._data else: self.grad._data += grad._data / other if isinstance(other, WebGPUTensor) and other.requires_grad: if other.grad is None: other.grad = WebGPUTensor(np.zeros_like(other._data), device=other.device, dtype=other.dtype, _internal=True) # Gradient w.r.t. other: -grad * self / other^2 other.grad._data -= grad._data * self._data / (other._data ** 2) result._backward_fn = div_backward return result def __pow__(self, other): if isinstance(other, WebGPUTensor): result_data = np.power(self._data, other._data) else: result_data = np.power(self._data, other) result = WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad or (isinstance(other, WebGPUTensor) and other.requires_grad)) # Set up autograd if result.requires_grad: result.grad_fn = 'PowBackward' result._inputs = [] if self.requires_grad: result._inputs.append(self) if isinstance(other, WebGPUTensor) and other.requires_grad: result._inputs.append(other) def pow_backward(grad, create_graph=False): if self.requires_grad: if self.grad is None: self.grad = WebGPUTensor(np.zeros_like(self._data), device=self.device, dtype=self.dtype, _internal=True) # Gradient w.r.t. base: grad * exponent * base^(exponent-1) if create_graph: # Use differentiable operations for higher-order gradients grad_tensor = grad if isinstance(grad, WebGPUTensor) else WebGPUTensor(grad._data, device=self.device, dtype=self.dtype, _internal=True) base_tensor = WebGPUTensor(self._data, device=self.device, dtype=self.dtype, requires_grad=True, _internal=True) if isinstance(other, WebGPUTensor): exponent_tensor = other else: exponent_tensor = WebGPUTensor(np.array(other), device=self.device, dtype=self.dtype) # grad * exponent * base^(exponent-1) grad_base = grad_tensor * exponent_tensor * (base_tensor ** (exponent_tensor - 1)) self.grad._data += grad_base._data self.grad.requires_grad = True self.grad.grad_fn = 'PowBackwardBackward' else: # Use NumPy for efficiency when not creating graph if isinstance(other, WebGPUTensor): self.grad._data += grad._data * other._data * np.power(self._data, other._data - 1) else: self.grad._data += grad._data * other * np.power(self._data, other - 1) if isinstance(other, WebGPUTensor) and other.requires_grad: if other.grad is None: other.grad = WebGPUTensor(np.zeros_like(other._data), device=other.device, dtype=other.dtype, _internal=True) # Gradient w.r.t. exponent: grad * log(base) * base^exponent if create_graph: grad_tensor = grad if isinstance(grad, WebGPUTensor) else WebGPUTensor(grad._data, device=self.device, dtype=self.dtype, _internal=True) base_tensor = WebGPUTensor(self._data, device=self.device, dtype=self.dtype, _internal=True) result_tensor = WebGPUTensor(result_data, device=self.device, dtype=self.dtype) # grad * log(base) * base^exponent grad_exp = grad_tensor * WebGPUTensor(np.log(self._data), device=self.device, dtype=self.dtype) * result_tensor other.grad._data += grad_exp._data other.grad.requires_grad = True else: other.grad._data += grad._data * np.log(self._data) * result_data result._backward_fn = pow_backward return result def __neg__(self): """Unary negation operator (-tensor)""" result_data = -self._data result = WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad) # Set up autograd if result.requires_grad: result.grad_fn = 'NegBackward' result._inputs = [self] def neg_backward(grad): if self.requires_grad: if self.grad is None: self.grad = WebGPUTensor(np.zeros_like(self._data), device=self.device, dtype=self.dtype, _internal=True) # Gradient of negation: -grad self.grad._data -= grad._data result._backward_fn = neg_backward return result def __pos__(self): """Unary positive operator (+tensor)""" result_data = +self._data result = WebGPUTensor(result_data, device="webgpu", dtype=self.dtype, requires_grad=self.requires_grad) # Set up autograd if result.requires_grad: result.grad_fn = 'PosBackward' result._inputs = [self] def pos_backward(grad): if self.requires_grad: if self.grad is None: self.grad = WebGPUTensor(np.zeros_like(self._data), device=self.device, dt