@huggingface/transformers
State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!
import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";
import { RawImage } from "../../utils/image.js";
const IMAGE_TOKEN = "<|image|>";
const IMAGE_TOKEN_PATTERN = /<\|image_\d+\|>/g;
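// Each numbered `<|image_n|>` placeholder in a prompt is expanded into a run of
// `<|image|>` tokens, one per image token reported by the image processor. For
// example (illustrative values), if num_img_tokens[0] === 3, the prompt
//   "<|image_1|>\nWhat is shown?"
// becomes
//   "<|image|><|image|><|image|>\nWhat is shown?"
// before tokenization (see `_call` below).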
export class Phi3VProcessor extends Processor {
    static image_processor_class = AutoImageProcessor
    static tokenizer_class = AutoTokenizer

    /**
     * Prepare text and (optionally) images as model inputs for Phi-3-vision.
     * @param {string|string[]} text The input text, containing `<|image_n|>` placeholders for the images.
     * @param {RawImage|RawImage[]} images The images to process, if any.
     * @param {{ padding?: boolean, truncation?: boolean, num_crops?: number } | undefined} options Tokenization and image-processing options.
     * @returns {Promise<any>} The combined text and image model inputs.
     */
    async _call(text, images = null, {
        padding = true,
        truncation = true,
        num_crops = null,
    } = {}) {
        if (!Array.isArray(text)) {
            text = [text];
        }

        let text_inputs, image_inputs;
        if (images) {
            image_inputs = await this.image_processor(images, { num_crops });
            const { num_img_tokens } = image_inputs;

            // The original implementation adds a bos_token before the image tokens
            // TODO: Check if this affects performance, since it looks like a bug in the original implementation
            const prompt_chunks = text.map((t, i) => t.split(IMAGE_TOKEN_PATTERN).join(IMAGE_TOKEN.repeat(num_img_tokens[i])));

            text_inputs = this.tokenizer(prompt_chunks, { padding, truncation });

            // The model expects image tokens to be negative, so we negate the image token ids
            const image_token_id = this.tokenizer.model.convert_tokens_to_ids([IMAGE_TOKEN])[0];
            text_inputs.input_ids.map_(id => (id == image_token_id) ? -id : id);
        } else {
            text_inputs = this.tokenizer(text);
        }

        return {
            ...text_inputs,
            ...image_inputs,
        }
    }
}
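For reference, a minimal usage sketch. In practice the processor is created via `AutoProcessor.from_pretrained` rather than instantiated directly; the checkpoint id and image URL below are assumptions for illustration.

```js
import { AutoProcessor, RawImage } from "@huggingface/transformers";

// Checkpoint id is an assumption; substitute a real Phi-3-vision ONNX checkpoint.
const processor = await AutoProcessor.from_pretrained("onnx-community/Phi-3.5-vision-instruct");

// Load an image and build a prompt containing a numbered image placeholder.
const image = await RawImage.fromURL("https://example.com/cat.jpg");
const prompt = "<|image_1|>\nWhat is shown in this image?";

// Returns { input_ids, attention_mask, ...image_inputs } ready to pass to the model.
const inputs = await processor(prompt, image);
```

Note that, per the `_call` implementation above, the expanded `<|image|>` tokens appear in `input_ids` with negated ids, which is how the model tells image positions apart from regular text tokens.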