/*
 * fp16
 * Version:
 * Half-precision 16-bit floating point numbers
 * 117 lines (116 loc) • 3.81 kB
 * JavaScript
 */
import { float16Precision, float16Emin, float16Emax, float32Precision, float32Emin, float32Emax, } from "./utils.js";
// Module-level 8-byte scratch buffer, reused by every call in this file so
// that inspecting the raw bits of a float64 does not allocate.
const float64Buffer = new ArrayBuffer(8);
// View over the scratch buffer: a number is written with setFloat64 and then
// read back as two 32-bit words with getInt32.
const float64View = new DataView(float64Buffer);
/**
 * Result codes returned by getFloat16Precision and getFloat32Precision.
 *
 * The object is frozen so that an importer cannot accidentally reassign or
 * add members and thereby change the codes seen by every other consumer.
 */
export const Precision = Object.freeze({
    /** the conversion preserves the exact value */
    Exact: 0,
    /** the conversion rounds inexactly */
    Inexact: 1,
    /** the conversion underflows */
    Underflow: 2,
    /** the conversion overflows */
    Overflow: 3,
});
/**
 * Classifies what would happen to a float64 value if it were converted to float16.
 *
 * @param value a float64 value
 * @returns a Precision enum member: Exact when the value is preserved, Inexact when it would be rounded, and Overflow or Underflow when its exponent falls outside the range allowed by float16Emax and float16Emin - float16Precision
 */
export function getFloat16Precision(value) {
    // NaN, zero (of either sign) and both infinities are always reported as exact.
    const isSpecialValue = isNaN(value) || value === 0 || value === Infinity || value === -Infinity;
    if (isSpecialValue) {
        return Precision.Exact;
    }
    // Write the number into the scratch buffer and read its bits back as two
    // 32-bit words: sign, exponent and top 20 mantissa bits first, then the
    // remaining 32 mantissa bits.
    float64View.setFloat64(0, value);
    const highWord = float64View.getInt32(0);
    const lowWord = float64View.getInt32(4);
    // The 11 exponent bits sit directly above the 20 mantissa bits of the
    // high word; 1023 is the float64 exponent bias.
    const unbiasedExponent = ((highWord >>> 20) & 0x7ff) - 1023;
    // NOTE(review): an exponent equal to float16Emax is never reported as
    // Overflow, even when the mantissa is too long and would be rounded —
    // confirm that rounding up at the top of the range is meant to be Inexact.
    if (unbiasedExponent > float16Emax) {
        return Precision.Overflow;
    }
    if (unbiasedExponent < float16Emin - float16Precision) {
        return Precision.Underflow;
    }
    // Number of mantissa bits up to and including the last set one.
    const mantissaBits = getSignificantBits(highWord & 0x000fffff, lowWord);
    if (mantissaBits > float16Precision) {
        return Precision.Inexact;
    }
    if (unbiasedExponent >= float16Emin) {
        return Precision.Exact;
    }
    // Below float16Emin the value can still be encoded losslessly as a
    // subnormal number, provided the precision left over after the mantissa
    // covers the distance down from float16Emin.
    const exponentDeficit = float16Emin - unbiasedExponent;
    const sparePrecision = float16Precision - mantissaBits;
    return exponentDeficit <= sparePrecision ? Precision.Exact : Precision.Inexact;
}
/**
 * Classifies what would happen to a float64 value if it were converted to float32.
 *
 * @param value a float64 value
 * @returns a Precision enum member: Exact when the value is preserved, Inexact when it would be rounded, and Overflow or Underflow when its exponent falls outside the range allowed by float32Emax and float32Emin - float32Precision
 */
export function getFloat32Precision(value) {
    // NaN, zero (of either sign) and both infinities are always reported as exact.
    const isSpecialValue = isNaN(value) || value === 0 || value === Infinity || value === -Infinity;
    if (isSpecialValue) {
        return Precision.Exact;
    }
    // Write the number into the scratch buffer and read its bits back as two
    // 32-bit words: sign, exponent and top 20 mantissa bits first, then the
    // remaining 32 mantissa bits.
    float64View.setFloat64(0, value);
    const highWord = float64View.getInt32(0);
    const lowWord = float64View.getInt32(4);
    // The 11 exponent bits sit directly above the 20 mantissa bits of the
    // high word; 1023 is the float64 exponent bias.
    const unbiasedExponent = ((highWord >>> 20) & 0x7ff) - 1023;
    // NOTE(review): an exponent equal to float32Emax is never reported as
    // Overflow, even when the mantissa is too long and would be rounded —
    // confirm that rounding up at the top of the range is meant to be Inexact.
    if (unbiasedExponent > float32Emax) {
        return Precision.Overflow;
    }
    if (unbiasedExponent < float32Emin - float32Precision) {
        return Precision.Underflow;
    }
    // Number of mantissa bits up to and including the last set one.
    const mantissaBits = getSignificantBits(highWord & 0x000fffff, lowWord);
    if (mantissaBits > float32Precision) {
        return Precision.Inexact;
    }
    if (unbiasedExponent >= float32Emin) {
        return Precision.Exact;
    }
    // Below float32Emin the value can still be encoded losslessly as a
    // subnormal number, provided the precision left over after the mantissa
    // covers the distance down from float32Emin.
    const exponentDeficit = float32Emin - unbiasedExponent;
    const sparePrecision = float32Precision - mantissaBits;
    return exponentDeficit <= sparePrecision ? Precision.Exact : Precision.Inexact;
}
/**
 * Finds the last set bit of a float64 mantissa that has been split into two words.
 *
 * Positions are 1-based and counted from the most significant mantissa bit, so
 * the result is also the number of leading mantissa bits needed to hold every
 * set bit.
 *
 * @param a the first 20 bits of the mantissa (in the low 20 bits of an int32)
 * @param b the last 32 bits of the mantissa
 * @returns the index (1 to 52) of the rightmost 1 bit, or 0 if no mantissa bit is set
 */
function getSignificantBits(a, b) {
    const lowWord = b | 0;
    const lowWordHasBits = lowWord !== 0;
    // The rightmost 1 bit lives in b whenever b is non-zero; otherwise it is in
    // a, whose 20 bits are shifted to the left end of an int32 so that both
    // cases can be measured from bit 31 downwards.
    const word = lowWordHasBits ? lowWord : a << 12;
    if (word === 0) {
        return 0;
    }
    // `word & -word` keeps only the lowest set bit; its count of leading zeros
    // plus one is that bit's 1-based position from the left of the word.
    const indexInWord = Math.clz32(word & -word) + 1;
    // Positions inside b come after the 20 mantissa bits held in a.
    return lowWordHasBits ? indexInWord + 20 : indexInWord;
}