import { Callable } from "../utils/generic.js";
import { Tensor, interpolate, stack } from "../utils/tensor.js";
import { bankers_round, max, min, softmax } from "../utils/maths.js";
import { RawImage } from "../utils/image.js";
import { calculateReflectOffset } from "../utils/core.js";
import { getModelJSON } from "../utils/hub.js";
import { IMAGE_PROCESSOR_NAME } from '../utils/constants.js';
/**
* Named tuple to indicate that the order we use is (height x width),
* even though the graphics industry standard is (width x height).
* @typedef {[height: number, width: number]} HeightWidth
*/
/**
* @typedef {object} ImageProcessorResult
* @property {Tensor} pixel_values The pixel values of the batched preprocessed images.
* @property {HeightWidth[]} original_sizes Array of two-dimensional tuples like [[480, 640]].
* @property {HeightWidth[]} reshaped_input_sizes Array of two-dimensional tuples like [[1000, 1330]].
*/
/**
* Helper function to constrain a value to be a multiple of a number.
* @param {number} val The value to constrain.
* @param {number} multiple The number to constrain to.
* @param {number} [minVal=0] The minimum value to constrain to.
* @param {number} [maxVal=null] The maximum value to constrain to.
* @returns {number} The constrained value.
* @private
*/
function constraint_to_multiple_of(val, multiple, minVal = 0, maxVal = null) {
const a = val / multiple;
let x = bankers_round(a) * multiple;
if (maxVal !== null && x > maxVal) {
x = Math.floor(a) * multiple;
}
if (x < minVal) {
x = Math.ceil(a) * multiple;
}
return x;
}
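// Illustrative behavior (a sketch of the rounding rules, not part of the module):
//   constraint_to_multiple_of(250, 32)          // -> 256 (250 / 32 = 7.8125 rounds up to 8)
//   constraint_to_multiple_of(250, 32, 0, 240)  // -> 224 (256 exceeds maxVal, so floor to 7 * 32)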
/**
* Rounds the width and height down to the nearest multiple of `divisor`,
* clamping each dimension to at least one multiple.
* @param {[number, number]} size The size of the image, as [width, height].
* @param {number} divisor The divisor to use.
* @returns {[number, number]} The rounded size.
*/
function enforce_size_divisibility([width, height], divisor) {
return [
Math.max(Math.floor(width / divisor), 1) * divisor,
Math.max(Math.floor(height / divisor), 1) * divisor
];
}
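// Illustrative behavior (sketch):
//   enforce_size_divisibility([251, 197], 32)  // -> [224, 192]
//   enforce_size_divisibility([10, 10], 32)    // -> [32, 32] (each dimension is clamped to at least one multiple)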
// Helper functions
/**
* Converts bounding boxes from center format to corners format.
*
* @param {number[]} arr The coordinate for the center of the box and its width, height dimensions (center_x, center_y, width, height)
* @returns {number[]} The coordinates of the top-left and bottom-right corners of the box (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
*/
function center_to_corners_format([centerX, centerY, width, height]) {
return [
centerX - width / 2,
centerY - height / 2,
centerX + width / 2,
centerY + height / 2
];
}
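// Illustrative behavior (sketch), e.g. for a normalized box:
//   center_to_corners_format([0.5, 0.5, 0.2, 0.4])  // -> [0.4, 0.3, 0.6, 0.7]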
/**
* Post-processes the outputs of the model (for object detection).
* @param {Object} outputs The outputs of the model that must be post-processed
* @param {Tensor} outputs.logits The logits
* @param {Tensor} outputs.pred_boxes The predicted boxes.
* @param {number} [threshold=0.5] The threshold to use for the scores.
* @param {[number, number][]} [target_sizes=null] The sizes of the original images.
* @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed.
* @return {Object[]} An array of objects containing the post-processed outputs.
*/
export function post_process_object_detection(outputs, threshold = 0.5, target_sizes = null, is_zero_shot = false) {
const out_logits = outputs.logits;
const out_bbox = outputs.pred_boxes;
const [batch_size, num_boxes, num_classes] = out_logits.dims;
if (target_sizes !== null && target_sizes.length !== batch_size) {
throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
}
let toReturn = [];
for (let i = 0; i < batch_size; ++i) {
let target_size = target_sizes !== null ? target_sizes[i] : null;
let info = {
boxes: [],
classes: [],
scores: []
}
let logits = out_logits[i];
let bbox = out_bbox[i];
for (let j = 0; j < num_boxes; ++j) {
let logit = logits[j];
let indices = [];
let probs;
if (is_zero_shot) {
// Get indices of classes with high enough probability
probs = logit.sigmoid().data;
for (let k = 0; k < probs.length; ++k) {
if (probs[k] > threshold) {
indices.push(k);
}
}
} else {
// Get most probable class
let maxIndex = max(logit.data)[1];
if (maxIndex === num_classes - 1) {
// This is the background class, skip it
continue;
}
// Compute softmax over classes
probs = softmax(logit.data);
if (probs[maxIndex] < threshold) {
continue;
}
indices.push(maxIndex);
}
for (const index of indices) {
// Some class has a high enough probability
/** @type {number[]} */
let box = bbox[j].data;
// convert to [x0, y0, x1, y1] format
box = center_to_corners_format(box)
if (target_size !== null) {
box = box.map((x, i) => x * target_size[(i + 1) % 2])
}
info.boxes.push(box);
info.classes.push(index);
info.scores.push(probs[index]);
}
}
toReturn.push(info);
}
return toReturn;
}
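// Illustrative usage (a sketch; `outputs` is assumed to come from a DETR-style
// detection model, and `image` is the corresponding RawImage):
//
//   const results = post_process_object_detection(outputs, 0.9, [[image.height, image.width]]);
//   const { boxes, classes, scores } = results[0];
//   // each entry of `boxes` is [x0, y0, x1, y1], scaled to the original image size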
/**
* Post-processes the outputs of the model (for semantic segmentation).
* @param {*} outputs Raw outputs of the model.
* @param {[number, number][]} [target_sizes=null] List of tuples corresponding to the requested final size
* (height, width) of each prediction. If unset, predictions will not be resized.
* @returns {{segmentation: Tensor; labels: number[]}[]} The semantic segmentation maps.
*/
export function post_process_semantic_segmentation(outputs, target_sizes = null) {
const logits = outputs.logits;
const batch_size = logits.dims[0];
if (target_sizes !== null && target_sizes.length !== batch_size) {
throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
}
const toReturn = [];
for (let i = 0; i < batch_size; ++i) {
const target_size = target_sizes !== null ? target_sizes[i] : null;
let data = logits[i];
// 1. If target_size is not null, we need to resize the masks to the target size
if (target_size !== null) {
// resize the masks to the target size
data = interpolate(data, target_size, 'bilinear', false);
}
const [height, width] = target_size ?? data.dims.slice(-2);
const segmentation = new Tensor(
'int32',
new Int32Array(height * width),
[height, width]
);
// Buffer to store current largest value
const buffer = data[0].data;
const segmentation_data = segmentation.data;
for (let j = 1; j < data.dims[0]; ++j) {
const row = data[j].data;
for (let k = 0; k < row.length; ++k) {
if (row[k] > buffer[k]) {
buffer[k] = row[k];
segmentation_data[k] = j;
}
}
}
// Store which objects have labels
// This is much more efficient than creating a set of the final values
const hasLabel = new Array(data.dims[0]);
for (let j = 0; j < segmentation_data.length; ++j) {
const index = segmentation_data[j];
hasLabel[index] = index;
}
/** @type {number[]} The unique list of labels that were detected */
const labels = hasLabel.filter(x => x !== undefined);
toReturn.push({ segmentation, labels });
}
return toReturn;
}
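// Illustrative usage (a sketch; `outputs` is assumed to come from a semantic
// segmentation model such as SegFormer):
//
//   const [{ segmentation, labels }] = post_process_semantic_segmentation(
//       outputs, [[image.height, image.width]]);
//   // `segmentation` is an int32 Tensor of shape [height, width] holding a class
//   // index per pixel; `labels` lists the distinct class indices that appear.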
/**
* Binarizes the given masks using `object_mask_threshold`, returning the associated `masks`, `scores`, and `labels`.
* @param {Tensor} class_logits The class logits.
* @param {Tensor} mask_logits The mask logits.
* @param {number} object_mask_threshold A number between 0 and 1 used to binarize the masks.
* @param {number} num_labels The number of labels.
* @returns {[Tensor[], number[], number[]]} The binarized masks, the scores, and the labels.
* @private
*/
function remove_low_and_no_objects(class_logits, mask_logits, object_mask_threshold, num_labels) {
const mask_probs_item = [];
const pred_scores_item = [];
const pred_labels_item = [];
for (let j = 0; j < class_logits.dims[0]; ++j) {
const cls = class_logits[j];
const mask = mask_logits[j];
const pred_label = max(cls.data)[1];
if (pred_label === num_labels) {
// This is the background class, so we ignore it
continue;
}
const scores = softmax(cls.data);
const pred_score = scores[pred_label];
if (pred_score > object_mask_threshold) {
mask_probs_item.push(mask);
pred_scores_item.push(pred_score);
pred_labels_item.push(pred_label);
}
}
return [mask_probs_item, pred_scores_item, pred_labels_item];
}
/**
* Checks whether the segment is valid or not.
* @param {Int32Array} mask_labels Labels for each pixel in the mask.
* @param {Tensor[]} mask_probs Probabilities for each pixel in the masks.
* @param {number} k The class id of the segment.
* @param {number} mask_threshold The mask threshold.
* @param {number} overlap_mask_area_threshold The overlap mask area threshold.
* @returns {[boolean, number[]]} Whether the segment is valid or not, and the indices of the valid labels.
* @private
*/
function check_segment_validity(
mask_labels,
mask_probs,
k,
mask_threshold = 0.5,
overlap_mask_area_threshold = 0.8
) {
// mask_k is a 1D array of indices, indicating where the mask is equal to k
const mask_k = [];
let mask_k_area = 0;
let original_area = 0;
const mask_probs_k_data = mask_probs[k].data;
// Compute the area of all the stuff in query k
for (let i = 0; i < mask_labels.length; ++i) {
if (mask_labels[i] === k) {
mask_k.push(i);
++mask_k_area;
}
if (mask_probs_k_data[i] >= mask_threshold) {
++original_area;
}
}
let mask_exists = mask_k_area > 0 && original_area > 0;
// Eliminate disconnected tiny segments
if (mask_exists) {
// Perform additional check
let area_ratio = mask_k_area / original_area;
mask_exists = area_ratio > overlap_mask_area_threshold;
}
return [mask_exists, mask_k]
}
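// Worked example (sketch): if 45 pixels are assigned to query k in `mask_labels`
// while 50 pixels of `mask_probs[k]` meet `mask_threshold`, then
// area_ratio = 45 / 50 = 0.9 > 0.8 (the default), so the segment is kept.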
/**
* Computes the segments.
* @param {Tensor[]} mask_probs The mask probabilities.
* @param {number[]} pred_scores The predicted scores.
* @param {number[]} pred_labels The predicted labels.
* @param {number} mask_threshold The mask threshold.
* @param {number} overlap_mask_area_threshold The overlap mask area threshold.
* @param {Set<number>} label_ids_to_fuse The label ids to fuse.
* @param {number[]} target_size The target size of the image.
* @returns {[Tensor, Array<{id: number, label_id: number, score: number}>]} The computed segments.
* @private
*/
function compute_segments(
mask_probs,
pred_scores,
pred_labels,
mask_threshold,
overlap_mask_area_threshold,
label_ids_to_fuse = null,
target_size = null,
) {
const [height, width] = target_size ?? mask_probs[0].dims;
const segmentation = new Tensor(
'int32',
new Int32Array(height * width),
[height, width]
);
const segments = [];
// 1. If target_size is not null, we need to resize the masks to the target size
if (target_size !== null) {
// resize the masks to the target size
for (let i = 0; i < mask_probs.length; ++i) {
mask_probs[i] = interpolate(mask_probs[i], target_size, 'bilinear', false);
}
}
// 2. Weigh each mask by its prediction score
// NOTE: `mask_probs` is updated in-place
//
// Temporary storage for the best label/scores for each pixel ([height, width]):
const mask_labels = new Int32Array(mask_probs[0].data.length);
const bestScores = new Float32Array(mask_probs[0].data.length);
for (let i = 0; i < mask_probs.length; ++i) {
let score = pred_scores[i];
const mask_probs_i_data = mask_probs[i].data;
for (let j = 0; j < mask_probs_i_data.length; ++j) {
mask_probs_i_data[j] *= score
if (mask_probs_i_data[j] > bestScores[j]) {
mask_labels[j] = i;
bestScores[j] = mask_probs_i_data[j];
}
}
}
let current_segment_id = 0;
// let stuff_memory_list = {}
const segmentation_data = segmentation.data;
for (let k = 0; k < pred_labels.length; ++k) {
const pred_class = pred_labels[k];
// TODO add `should_fuse`
// let should_fuse = pred_class in label_ids_to_fuse
// Check if the mask exists and is large enough to be a segment
const [mask_exists, mask_k] = check_segment_validity(
mask_labels,
mask_probs,
k,
mask_threshold,
overlap_mask_area_threshold
)
if (!mask_exists) {
// Nothing to see here
continue;
}
// TODO
// if (pred_class in stuff_memory_list) {
// current_segment_id = stuff_memory_list[pred_class]
// } else {
// current_segment_id += 1;
// }
++current_segment_id;
// Add current object segment to final segmentation map
for (const index of mask_k) {
segmentation_data[index] = current_segment_id;
}
segments.push({
id: current_segment_id,
label_id: pred_class,
// was_fused: should_fuse, TODO
score: pred_scores[k],
})
// TODO
// if(should_fuse){
// stuff_memory_list[pred_class] = current_segment_id
// }
}
return [segmentation, segments];
}
/**
* Rescales the image so that the following conditions are met:
*
* 1. Both dimensions (height and width) are divisible by 'factor'.
* 2. The total number of pixels is within the range [`min_pixels`, `max_pixels`].
* 3. The aspect ratio of the image is maintained as closely as possible.
*
* @param {number} height The height of the image.
* @param {number} width The width of the image.
* @param {number} [factor=28] The factor to use for resizing.
* @param {number} [min_pixels=56*56] The minimum number of pixels.
* @param {number} [max_pixels=14*14*4*1280] The maximum number of pixels.
* @returns {[number, number]} The new height and width of the image.
* @throws {Error} If the height or width is smaller than the factor.
*/
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
if (height < factor || width < factor) {
throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
} else if (Math.max(height, width) / Math.min(height, width) > 200) {
throw new Error(
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
);
}
let h_bar = Math.round(height / factor) * factor;
let w_bar = Math.round(width / factor) * factor;
if (h_bar * w_bar > max_pixels) {
const beta = Math.sqrt((height * width) / max_pixels);
h_bar = Math.floor((height / beta) / factor) * factor;
w_bar = Math.floor((width / beta) / factor) * factor;
} else if (h_bar * w_bar < min_pixels) {
const beta = Math.sqrt(min_pixels / (height * width));
h_bar = Math.ceil((height * beta) / factor) * factor;
w_bar = Math.ceil((width * beta) / factor) * factor;
}
return [h_bar, w_bar];
}
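// Worked example (sketch): smart_resize(480, 640) with the default factor of 28 gives
//   h_bar = round(480 / 28) * 28 = 476, w_bar = round(640 / 28) * 28 = 644,
// and 476 * 644 = 306,544 pixels already lies within [min_pixels, max_pixels],
// so the result is [476, 644].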
/**
* Post-process the model output to generate the final panoptic segmentation.
* @param {*} outputs The model output to post process
* @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks.
* @param {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values.
* @param {number} [overlap_mask_area_threshold=0.8] The overlap mask area threshold to merge or discard small disconnected parts within each binary instance mask.
* @param {Set<number>} [label_ids_to_fuse=null] The labels in this state will have all their instances be fused together.
* @param {[number, number][]} [target_sizes=null] The target sizes to resize the masks to.
* @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>}
*/
export function post_process_panoptic_segmentation(
outputs,
threshold = 0.5,
mask_threshold = 0.5,
overlap_mask_area_threshold = 0.8,
label_ids_to_fuse = null,
target_sizes = null,
) {
if (label_ids_to_fuse === null) {
console.warn("`label_ids_to_fuse` unset. No instance will be fused.")
label_ids_to_fuse = new Set();
}
const class_queries_logits = outputs.class_queries_logits ?? outputs.logits; // [batch_size, num_queries, num_classes+1]
const masks_queries_logits = outputs.masks_queries_logits ?? outputs.pred_masks; // [batch_size, num_queries, height, width]
const mask_probs = masks_queries_logits.sigmoid() // [batch_size, num_queries, height, width]
let [batch_size, num_queries, num_labels] = class_queries_logits.dims;
num_labels -= 1; // Remove last class (background)
if (target_sizes !== null && target_sizes.length !== batch_size) {
throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
}
let toReturn = [];
for (let i = 0; i < batch_size; ++i) {
let target_size = target_sizes !== null ? target_sizes[i] : null;
let class_logits = class_queries_logits[i];
let mask_logits = mask_probs[i];
let [mask_probs_item, pred_scores_item, pred_labels_item] = remove_low_and_no_objects(class_logits, mask_logits, threshold, num_labels);
if (pred_labels_item.length === 0) {
// No mask found
let [height, width] = target_size ?? mask_logits.dims.slice(-2);
let segmentation = new Tensor(
'int32',
new Int32Array(height * width).fill(-1),
[height, width]
)
toReturn.push({
segmentation: segmentation,
segments_info: []
});
continue;
}
// Get segmentation map and segment information of batch item
let [segmentation, segments] = compute_segments(
mask_probs_item,
pred_scores_item,
pred_labels_item,
mask_threshold,
overlap_mask_area_threshold,
label_ids_to_fuse,
target_size,
)
toReturn.push({
segmentation: segmentation,
segments_info: segments
})
}
return toReturn;
}
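// Illustrative usage (a sketch; `outputs` is assumed to come from a panoptic model
// such as Mask2Former, exposing `class_queries_logits` and `masks_queries_logits`):
//
//   const [{ segmentation, segments_info }] = post_process_panoptic_segmentation(
//       outputs, 0.5, 0.5, 0.8, new Set(), [[image.height, image.width]]);
//   // `segmentation` maps each pixel to a segment id (or -1 everywhere if no mask was found);
//   // `segments_info` lists { id, label_id, score } for each detected segment.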
/**
* Post-processes the outputs of the model (for instance segmentation).
* @param {*} outputs Raw outputs of the model.
* @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks.
* @param {[number, number][]} [target_sizes=null] List of tuples corresponding to the requested final size
* (height, width) of each prediction. If unset, predictions will not be resized.
* @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>}
*/
export function post_process_instance_segmentation(outputs, threshold = 0.5, target_sizes = null) {
throw new Error('`post_process_instance_segmentation` is not yet implemented.');
}
/**
* @typedef {Object} ImageProcessorConfig A configuration object used to create an image processor.
* @property {function} [progress_callback=null] If specified, this function will be called during model construction, to provide the user with progress updates.
* @property {number[]} [image_mean] The mean values for image normalization.
* @property {number[]} [image_std] The standard deviation values for image normalization.
* @property {boolean} [do_rescale] Whether to rescale the image pixel values to the [0,1] range.
* @property {number} [rescale_factor] The factor to use for rescaling the image pixel values.
* @property {boolean} [do_normalize] Whether to normalize the image pixel values.
* @property {boolean} [do_resize] Whether to resize the image.
* @property {number} [resample] What method to use for resampling.
* @property {number|Object} [size] The size to resize the image to.
* @property {number|Object} [image_size] The size to resize the image to (same as `size`).
* @property {boolean} [do_flip_channel_order=false] Whether to flip the color channels from RGB to BGR.
* Can be overridden by the `do_flip_channel_order` parameter in the `preprocess` method.
* @property {boolean} [do_center_crop] Whether to center crop the image to the specified `crop_size`.
* Can be overridden by `do_center_crop` in the `preprocess` method.
* @property {boolean} [do_thumbnail] Whether to resize the image using thumbnail method.
* @property {boolean} [keep_aspect_ratio] If `true`, the image is resized to the largest possible size such that the aspect ratio is preserved.
* Can be overridden by `keep_aspect_ratio` in `preprocess`.
* @property {number} [ensure_multiple_of] If `do_resize` is `true`, the image is resized to a size that is a multiple of this value.
* Can be overridden by `ensure_multiple_of` in `preprocess`.
*
* @property {number[]} [mean] The mean values for image normalization (same as `image_mean`).
* @property {number[]} [std] The standard deviation values for image normalization (same as `image_std`).
*/
export class ImageProcessor extends Callable {
/**
* Constructs a new `ImageProcessor`.
* @param {ImageProcessorConfig} config The configuration object.
*/
constructor(config) {
super();
this.image_mean = config.image_mean ?? config.mean;
this.image_std = config.image_std ?? config.std;
this.resample = config.resample ?? 2; // 2 => bilinear
this.do_rescale = config.do_rescale ?? true;
this.rescale_factor = config.rescale_factor ?? (1 / 255);
this.do_normalize = config.do_normalize;
this.do_thumbnail = config.do_thumbnail;
this.size = config.size ?? config.image_size;
this.do_resize = config.do_resize ?? (this.size !== undefined);
this.size_divisibility = config.size_divisibility ?? config.size_divisor;
this.do_center_crop = config.do_center_crop;
this.crop_size = config.crop_size;
this.do_convert_rgb = config.do_convert_rgb ?? true;
this.do_crop_margin = config.do_crop_margin;
this.pad_size = config.pad_size;
this.do_pad = config.do_pad;
if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
// Should pad, but no pad size specified
// We infer the pad size from the resize size
this.pad_size = this.size
}
this.do_flip_channel_order = config.do_flip_channel_order ?? false;
this.config = config;
}
/**
* Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any
* corresponding dimension of the specified size.
* @param {RawImage} image The image to be resized.
* @param {{height:number, width:number}} size The size `{"height": h, "width": w}` to resize the image to.
* @param {string | 0 | 1 | 2 | 3 | 4 | 5} [resample=2] The resampling filter to use.
* @returns {Promise<RawImage>} The resized image.
*/
async thumbnail(image, size, resample = 2) {
const input_height = image.height;
const input_width = image.width;
const output_height = size.height;
const output_width = size.width;
// We always resize to the smallest of either the input or output size.
let height = Math.min(input_height, output_height)
let width = Math.min(input_width, output_width)
if (height === input_height && width === input_width) {
return image;
}
if (input_height > input_width) {
width = Math.floor(input_width * height / input_height);
} else if (input_width > input_height) {
height = Math.floor(input_height * width / input_width);
}
return await image.resize(width, height, { resample });
}
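// Worked example (sketch): for a 1000x800 image and size { height: 500, width: 500 },
// both edges are first clamped to 500; since width is the larger input dimension,
// the height is rescaled to floor(800 * 500 / 1000) = 400, yielding a 500x400 thumbnail.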
/**
* Crops the margin of the image. Pixels whose normalized gray value is below the
* threshold are treated as content; brighter (near-white) pixels are treated as margin and cropped away.
* @param {RawImage} image The image to be cropped.
* @param {number} gray_threshold Gray value (out of 255) below which pixels are considered content rather than margin.
* @returns {Promise<RawImage>} The cropped image.
*/
async crop_margin(image, gray_threshold = 200) {
const gray_image = image.clone().grayscale();
const minValue = min(gray_image.data)[0];
const maxValue = max(gray_image.data)[0];
const diff = maxValue - minValue;
if (diff === 0) {
return image;
}
const threshold = gray_threshold / 255;
let x_min = gray_image.width, y_min = gray_image.height, x_max = 0, y_max = 0;
const gray_image_data = gray_image.data;
for (let j = 0; j < gray_image.height; ++j) {
const row = j * gray_image.width;
for (let i = 0; i < gray_image.width; ++i) {
if ((gray_image_data[row + i] - minValue) / diff < threshold) {
// We have a content (dark) pixel, so we update the bounding box accordingly
x_min = Math.min(x_min, i);
y_min = Math.min(y_min, j);
x_max = Math.max(x_max, i);
y_max = Math.max(y_max, j);
}
}
}
image = await image.crop([x_min, y_min, x_max, y_max]);
return image;
}
/**
* Pad the image by a certain amount.
* @param {Float32Array} pixelData The pixel data to pad.
* @param {number[]} imgDims The dimensions of the image (height, width, channels).
* @param {{width:number; height:number}|number} padSize The dimensions of the padded image.
* @param {Object} options The options for padding.
* @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add.
* @param {boolean} [options.center=false] Whether to center the image.
* @param {number|number[]} [options.constant_values=0] The constant value to use for padding.
* @returns {[Float32Array, number[]]} The padded pixel data and image dimensions.
*/
pad_image(pixelData, imgDims, padSize, {
mode = 'constant',
center = false,
constant_values = 0,
} = {}) {
const [imageHeight, imageWidth, imageChannels] = imgDims;
let paddedImageWidth, paddedImageHeight;
if (typeof padSize === 'number') {
paddedImageWidth = padSize;
paddedImageHeight = padSize;
} else {
paddedImageWidth = padSize.width;
paddedImageHeight = padSize.height;
}
// Only add padding if there is a difference in size
if (paddedImageWidth !== imageWidth || paddedImageHeight !== imageHeight) {
const paddedPixelData = new Float32Array(paddedImageWidth * paddedImageHeight * imageChannels);
if (Array.isArray(constant_values)) {
// Fill with constant values, cycling through the array
for (let i = 0; i < paddedPixelData.length; ++i) {
paddedPixelData[i] = constant_values[i % imageChannels];
}
} else if (constant_values !== 0) {
paddedPixelData.fill(constant_values);
}
const [left, top] = center
? [Math.floor((paddedImageWidth - imageWidth) / 2), Math.floor((paddedImageHeight - imageHeight) / 2)]
: [0, 0];
// Copy the original image into the padded image
for (let i = 0; i < imageHeight; ++i) {
const a = (i + top) * paddedImageWidth;
const b = i * imageWidth;
for (let j = 0; j < imageWidth; ++j) {
const c = (a + j + left) * imageChannels;
const d = (b + j) * imageChannels;
for (let k = 0; k < imageChannels; ++k) {
paddedPixelData[c + k] = pixelData[d + k];
}
}
}
if (mode === 'symmetric') {
if (center) {
throw new Error('`center` padding is not supported when `mode` is set to `symmetric`.');
// TODO: Implement this
}
const h1 = imageHeight - 1;
const w1 = imageWidth - 1;
for (let i = 0; i < paddedImageHeight; ++i) {
const a = i * paddedImageWidth;
const b = calculateReflectOffset(i, h1) * imageWidth;
for (let j = 0; j < paddedImageWidth; ++j) {
if (i < imageHeight && j < imageWidth) continue; // Do not overwrite original image
const c = (a + j) * imageChannels;
const d = (b + calculateReflectOffset(j, w1)) * imageChannels;
// Copy channel-wise
for (let k = 0; k < imageChannels; ++k) {
paddedPixelData[c + k] = pixelData[d + k];
}
}
}
}
// Update pixel data and image dimensions
pixelData = paddedPixelData;
imgDims = [paddedImageHeight, paddedImageWidth, imageChannels]
}
return [pixelData, imgDims];
}
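// Illustrative behavior (a sketch; `processor` is any ImageProcessor instance):
// padding a 2x2 single-channel image to 4x4 with `center: true` places the original
// pixels at rows/columns 1-2 and fills the border with `constant_values` (0 by default):
//
//   const [padded, dims] = processor.pad_image(
//       Float32Array.from([1, 2, 3, 4]), [2, 2, 1],
//       { width: 4, height: 4 }, { center: true });
//   // dims -> [4, 4, 1]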
/**
* Rescale the image's pixel values by `this.rescale_factor` (in place).
* @param {Float32Array} pixelData The pixel data to rescale.
* @returns {void}
*/
rescale(pixelData) {
for (let i = 0; i < pixelData.length; ++i) {
pixelData[i] = this.rescale_factor * pixelData[i];
}
}
/**
* Find the target (width, height) dimension of the output image after
* resizing given the input image and the desired size.
* @param {RawImage} image The image to resize.
* @param {any} size The size to use for resizing the image.
* @returns {[number, number]} The target (width, height) dimension of the output image after resizing.
*/
get_resize_output_image_size(image, size) {
// `size` comes in many forms, so we need to handle them all here:
// 1. an integer: resize so that the shortest edge matches it (optionally capped by `this.config.max_size`)
// 2. { shortest_edge, longest_edge }: resize while respecting both constraints
// 3. { width, height }: resize to those dimensions (with optional aspect-ratio / multiple-of logic)
// 4. { min_pixels, max_pixels }: Qwen2-VL-style smart resizing
const [srcWidth, srcHeight] = image.size;
let shortest_edge;
let longest_edge;
if (this.do_thumbnail) {
// NOTE: custom logic for `Donut` models
const { height, width } = size;
shortest_edge = Math.min(height, width)
}
// Support both formats for backwards compatibility
else if (Number.isInteger(size)) {
shortest_edge = size;
longest_edge = this.config.max_size ?? shortest_edge;
} else if (size !== undefined) {
// Extract known properties from `size`
shortest_edge = size.shortest_edge;
longest_edge = size.longest_edge;
}
// If `longest_edge` and `shortest_edge` are set, maintain aspect ratio and resize to `shortest_edge`
// while keeping the largest dimension <= `longest_edge`
if (shortest_edge !== undefined || longest_edge !== undefined) {
// http://opensourcehacker.com/2011/12/01/calculate-aspect-ratio-conserving-resize-for-images-in-javascript/
// Try resize so that shortest edge is `shortest_edge` (target)
const shortResizeFactor = shortest_edge === undefined
? 1 // If `shortest_edge` is not set, don't upscale
: Math.max(shortest_edge / srcWidth, shortest_edge / srcHeight);
const newWidth = srcWidth * shortResizeFactor;
const newHeight = srcHeight * shortResizeFactor;
// The new width and height might be greater than `longest_edge`, so
// we downscale again to ensure the largest dimension is `longest_edge`
const longResizeFactor = longest_edge === undefined
? 1 // If `longest_edge` is not set, don't downscale
: Math.min(longest_edge / newWidth, longest_edge / newHeight);
// To avoid certain floating point precision issues, we round to 2 decimal places
let finalWidth = Math.floor(Number((newWidth * longResizeFactor).toFixed(2)));
let finalHeight = Math.floor(Number((newHeight * longResizeFactor).toFixed(2)));
if (this.size_divisibility !== undefined) {
[finalWidth, finalHeight] = enforce_size_divisibility([finalWidth, finalHeight], this.size_divisibility)
}
return [finalWidth, finalHeight];
} else if (size !== undefined && size.width !== undefined && size.height !== undefined) {
// If `width` and `height` are set, resize to those dimensions
let newWidth = size.width;
let newHeight = size.height;
// Custom for DPT models
if (this.config.keep_aspect_ratio && this.config.ensure_multiple_of) {
// determine new height and width
let scale_height = newHeight / srcHeight;
let scale_width = newWidth / srcWidth;
// scale as little as possible
if (Math.abs(1 - scale_width) < Math.abs(1 - scale_height)) {
// fit width
scale_height = scale_width;
} else {
// fit height
scale_width = scale_height;
}
newHeight = constraint_to_multiple_of(scale_height * srcHeight, this.config.ensure_multiple_of);
newWidth = constraint_to_multiple_of(scale_width * srcWidth, this.config.ensure_multiple_of);
}
return [newWidth, newHeight];
} else if (this.size_divisibility !== undefined) {
return enforce_size_divisibility([srcWidth, srcHeight], this.size_divisibility);
} else if (size.min_pixels !== undefined && size.max_pixels !== undefined) {
// Custom resize logic for Qwen2-VL models
const { min_pixels, max_pixels } = size;
const factor = this.config.patch_size * this.config.merge_size;
return smart_resize(srcHeight, srcWidth, factor, min_pixels, max_pixels);
} else {
throw new Error(`Could not resize image due to unsupported \`this.size\` option in config: ${JSON.stringify(size)}`);
}
}
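// Worked example (sketch): with `size = { shortest_edge: 384, longest_edge: 512 }`
// and a 640x480 input, the short-edge factor is max(384 / 640, 384 / 480) = 0.8,
// giving 512x384; the longest edge is already <= 512, so the result is [512, 384].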
/**
* Resizes the image.
* @param {RawImage} image The image to resize.
* @returns {Promise<RawImage>} The resized image.
*/
async resize(image) {
const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
return await image.resize(newWidth, newHeight, {
resample: this.resample,
});
}
/**
* @typedef {object} PreprocessedImage
* @property {HeightWidth} original_size The original size of the image.
* @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
* @property {Tensor} pixel_values The pixel values of the preprocessed image.
*/
/**
* Preprocesses the given image.
*
* @param {RawImage} image The image to preprocess.
* @param {Object} overrides The overrides for the preprocessing options.
* @returns {Promise<PreprocessedImage>} The preprocessed image.
*/
async preprocess(image, {
do_normalize = null,
do_pad = null,
do_convert_rgb = null,
do_convert_grayscale = null,
do_flip_channel_order = null,
} = {}) {
if (this.do_crop_margin) {
// NOTE: Specific to nougat processors. This is done before resizing,
// and can be interpreted as a pre-preprocessing step.
image = await this.crop_margin(image);
}
const [srcWidth, srcHeight] = image.size; // original image size
// Convert image to RGB if specified in config.
if (do_convert_rgb ?? this.do_convert_rgb) {
image = image.rgb();
} else if (do_convert_grayscale) {
image = image.grayscale();
}
// TODO:
// For efficiency reasons, it might be best to merge the resize and center crop operations into one.
// Resize all images
if (this.do_resize) {
image = await this.resize(image);
}
// Resize the image using thumbnail method.
if (this.do_thumbnail) {
image = await this.thumbnail(image, this.size, this.resample);
}
if (this.do_center_crop) {
let crop_width;
let crop_height;
if (Number.isInteger(this.crop_size)) {
crop_width = this.crop_size;
crop_height = this.crop_size;
} else {
crop_width = this.crop_size.width;
crop_height = this.crop_size.height;
}
image = await image.center_crop(crop_width, crop_height);
}
/** @type {HeightWidth} */
const reshaped_input_size = [image.height, image.width];
// NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
// occurs with data in the hwc format (height, width, channels),
// to emulate the behavior of the original Python code (w/ numpy).
let pixelData = Float32Array.from(image.data);
let imgDims = [image.height, image.width, image.channels];
if (this.do_rescale) {
this.rescale(pixelData);
}
if (do_normalize ?? this.do_normalize) {
let image_mean = this.image_mean;
if (!Array.isArray(this.image_mean)) {
image_mean = new Array(image.channels).fill(image_mean);
}
let image_std = this.image_std;
if (!Array.isArray(this.image_std)) {
image_std = new Array(image.channels).fill(image_std); // NOTE: fill with the scalar std, not the mean
}
if (image_mean.length !== image.channels || image_std.length !== image.channels) {
throw new Error(`When set to arrays, the length of \`image_mean\` (${image_mean.length}) and \`image_std\` (${image_std.length}) must match the number of channels in the image (${image.channels}).`);
}
for (let i = 0; i < pixelData.length; i += image.channels) {
for (let j = 0; j < image.channels; ++j) {
pixelData[i + j] = (pixelData[i + j] - image_mean[j]) / image_std[j];
}
}
}
// do padding after rescaling/normalizing
if (do_pad ?? this.do_pad) {
if (this.pad_size) {
const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
[pixelData, imgDims] = padded; // Update pixel data and image dimensions
} else if (this.size_divisibility) {
const [paddedWidth, paddedHeight] = enforce_size_divisibility([imgDims[1], imgDims[0]], this.size_divisibility);
[pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
}
}
if (do_flip_channel_order ?? this.do_flip_channel_order) {
if (imgDims[2] !== 3) {
throw new Error('Flipping channel order is only supported for RGB images.');
}
// Convert RGB to BGR
for (let i = 0; i < pixelData.length; i += 3) {
const temp = pixelData[i];
pixelData[i] = pixelData[i + 2];
pixelData[i + 2] = temp;
}
}
const pixel_values = new Tensor('float32', pixelData, imgDims)
.permute(2, 0, 1); // convert to channel dimension format (hwc -> chw)
return {
original_size: [srcHeight, srcWidth],
reshaped_input_size: reshaped_input_size,
pixel_values,
}
}
/**
* Calls the feature extraction process on an array of images,
* preprocesses each image, and concatenates the resulting
* features into a single Tensor.
* @param {RawImage[]} images The image(s) to extract features from.
* @param {...any} args Additional arguments.
* @returns {Promise<ImageProcessorResult>} An object containing the concatenated pixel values (and other metadata) of the preprocessed images.
*/
async _call(images, ...args) {
if (!Array.isArray(images)) {
images = [images];
}
/** @type {PreprocessedImage[]} */
const imageData = await Promise.all(images.map(x => this.preprocess(x)));
// Stack pixel values
const pixel_values = stack(imageData.map(x => x.pixel_values), 0);
return {
pixel_values,
// Original sizes of images
original_sizes: imageData.map(x => x.original_size),
// Reshaped sizes of images, before padding or cropping
reshaped_input_sizes: imageData.map(x => x.reshaped_input_size),
}
}
/**
* Instantiate one of the processor classes of the library from a pretrained model.
*
* The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
* property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
*
* @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
* - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
* Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
* user or organization name, like `dbmdz/bert-base-german-cased`.
* - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
* @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
*
* @returns {Promise<ImageProcessor>} A new instance of the Processor class.
*/
static async from_pretrained(pretrained_model_name_or_path, options) {
const preprocessorConfig = await getModelJSON(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME, true, options);
return new this(preprocessorConfig);
}
}
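// Illustrative end-to-end usage (a sketch; the model id below is a placeholder, and
// the import path is the one used within this package):
//
//   import { RawImage } from '../utils/image.js';
//
//   const processor = await ImageProcessor.from_pretrained('<model-id>');
//   const image = await RawImage.read('https://example.com/image.jpg');
//   const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
//   // `pixel_values` is a float32 Tensor of shape [batch_size, channels, height, width]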