transformers-fork

State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!

import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";

export class Florence2Processor extends Processor {
    static tokenizer_class = AutoTokenizer
    static image_processor_class = AutoImageProcessor

    constructor(config, components) {
        super(config, components);

        const {
            tasks_answer_post_processing_type,
            task_prompts_without_inputs,
            task_prompts_with_input,
        } = this.image_processor.config;

        /** @type {Map<string, string>} */
        this.tasks_answer_post_processing_type = new Map(Object.entries(tasks_answer_post_processing_type ?? {}));

        /** @type {Map<string, string>} */
        this.task_prompts_without_inputs = new Map(Object.entries(task_prompts_without_inputs ?? {}));

        /** @type {Map<string, string>} */
        this.task_prompts_with_input = new Map(Object.entries(task_prompts_with_input ?? {}));

        this.regexes = {
            quad_boxes: /(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>/gm,
            bboxes: /([^<]+)?<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>/gm,
        }
        this.size_per_bin = 1000;
    }

    /**
     * Helper function to construct prompts from input texts
     * @param {string|string[]} text
     * @returns {string[]}
     */
    construct_prompts(text) {
        if (typeof text === 'string') {
            text = [text];
        }

        const prompts = [];
        for (const t of text) {
            // 1. fixed task prompts without additional inputs
            if (this.task_prompts_without_inputs.has(t)) {
                prompts.push(this.task_prompts_without_inputs.get(t));
            }

            // 2. task prompts with additional inputs
            else {
                for (const [task, prompt] of this.task_prompts_with_input) {
                    if (t.includes(task)) {
                        prompts.push(prompt.replaceAll('{input}', t).replaceAll(task, ''));
                        break;
                    }
                }

                // 3. default prompt
                if (prompts.length !== text.length) {
                    prompts.push(t);
                }
            }
        }
        return prompts;
    }

    /**
     * Post-process the output of the model to each of the task outputs.
     * @param {string} text The text to post-process.
     * @param {string} task The task to post-process the text for.
     * @param {[number, number]} image_size The size of the image (width, height), used to scale the location bins back to pixel coordinates.
     * @returns {Object} An object with the task as key and the post-processed output as value.
     */
    post_process_generation(text, task, image_size) {
        const task_answer_post_processing_type = this.tasks_answer_post_processing_type.get(task) ?? 'pure_text';

        // remove the special tokens
        text = text.replaceAll('<s>', '').replaceAll('</s>', '');

        let final_answer;
        switch (task_answer_post_processing_type) {
            case 'pure_text':
                final_answer = text;
                break;

            case 'description_with_bboxes':
            case 'bboxes':
            case 'phrase_grounding':
            case 'ocr':
                const key = task_answer_post_processing_type === 'ocr' ? 'quad_boxes' : 'bboxes';
                const matches = text.matchAll(this.regexes[key]);
                const labels = [];
                const items = [];
                for (const [_, label, ...locations] of matches) {
                    // Push new label, or duplicate the last label
                    labels.push(label ? label.trim() : labels.at(-1) ?? '');

                    items.push(locations.map((x, i) =>
                        // NOTE: Add 0.5 to use the center position of the bin as the coordinate.
                        (Number(x) + 0.5) / this.size_per_bin * image_size[i % 2])
                    );
                }
                final_answer = { labels, [key]: items };
                break;

            default:
                throw new Error(`Task "${task}" (of type "${task_answer_post_processing_type}") not yet implemented.`);
        }

        return { [task]: final_answer }
    }

    // NOTE: images and text are switched from the python version
    // `images` is required, `text` is optional
    async _call(images, text = null, kwargs = {}) {
        if (!images && !text) {
            throw new Error('Either text or images must be provided');
        }

        const image_inputs = await this.image_processor(images, kwargs);
        const text_inputs = text ? this.tokenizer(text, kwargs) : {};

        return {
            ...image_inputs,
            ...text_inputs,
        }
    }
}
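
For context, a minimal end-to-end usage sketch follows. It assumes this fork re-exports the same top-level API as @huggingface/transformers (Florence2ForConditionalGeneration, AutoProcessor, RawImage); the import path, checkpoint id, and task string are illustrative only and may differ in practice.

import {
    Florence2ForConditionalGeneration,
    AutoProcessor,
    RawImage,
} from 'transformers-fork'; // assumed import path; adjust to however this fork is published

// Illustrative checkpoint id; any Florence-2 checkpoint packaged for the web should work similarly.
const model_id = 'onnx-community/Florence-2-base-ft';
const model = await Florence2ForConditionalGeneration.from_pretrained(model_id);
const processor = await AutoProcessor.from_pretrained(model_id); // resolves to Florence2Processor

// Load an image and turn a task token into its full prompt (see `construct_prompts` above).
const image = await RawImage.fromURL('https://example.com/cat.jpg');
const task = '<OD>'; // object detection
const prompts = processor.construct_prompts(task);

// Preprocess. Note the argument order: `images` first, then `text` (see `_call` above).
const inputs = await processor(image, prompts);

// Generate and decode, keeping special tokens so the '<loc_*>' markers survive.
const generated_ids = await model.generate({ ...inputs, max_new_tokens: 100 });
const generated_text = processor.tokenizer.batch_decode(generated_ids, { skip_special_tokens: false })[0];

// Map the 0-999 location bins back to pixel coordinates. `image.size` is [width, height],
// matching the `image_size[i % 2]` indexing in `post_process_generation`.
const result = processor.post_process_generation(generated_text, task, image.size);
// e.g. { '<OD>': { labels: ['cat'], bboxes: [[x1, y1, x2, y2]] } }
console.log(result);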