UNPKG

transformers-fork

Version:

State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!

53 lines (43 loc) • 1.72 kB
import { Processor } from "../../base/processing_utils.js"; import { AutoImageProcessor } from "../auto/image_processing_auto.js"; import { AutoTokenizer } from "../../tokenizers.js"; import { RawImage } from "../../utils/image.js"; export class Qwen2VLProcessor extends Processor { static image_processor_class = AutoImageProcessor static tokenizer_class = AutoTokenizer /** * * @param {string|string[]} text * @param {RawImage|RawImage[]} images * @param {...any} args * @returns {Promise<any>} */ async _call(text, images = null, ...args) { if (!Array.isArray(text)) { text = [text]; } let image_inputs, image_grid_thw; if (images) { image_inputs = await this.image_processor(images); image_grid_thw = image_inputs.image_grid_thw; } if (image_grid_thw) { let merge_length = this.image_processor.config.merge_size ** 2; let index = 0; const image_grid_thw_list = image_grid_thw.tolist(); text = text.map(t => { while (t.includes("<|image_pad|>")) { const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n)); t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length))); } return t.replaceAll("<|placeholder|>", "<|image_pad|>"); }); } const text_inputs = this.tokenizer(text); return { ...text_inputs, ...image_inputs, // TODO: ...videos_inputs, } } }