@huggingface/transformers
Version:
State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!
75 lines (62 loc) • 2.7 kB
JavaScript
import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";
import { RawImage } from "../../utils/image.js";
import { RawAudio } from "../../utils/audio.js";
/**
 * Processor that prepares text prompts, together with optional image and audio
 * inputs, for a Gemma3n model.
 *
 * Each audio/image placeholder token found in a prompt is expanded into a full
 * sequence of the form `\n\n<begin><soft token repeated N times><end>\n\n`,
 * where N is `audio_seq_length` / `image_seq_length` from the processor config.
 */
export class Gemma3nProcessor extends Processor {
    static image_processor_class = AutoImageProcessor;
    static feature_extractor_class = AutoFeatureExtractor;
    static tokenizer_class = AutoTokenizer;
    static uses_processor_config = true;
    static uses_chat_template_file = true;

    /**
     * Forwards all arguments to the base `Processor` constructor, then caches
     * the special tokens and the pre-expanded audio/image sequences.
     *
     * @param {Object} config Processor configuration; `audio_seq_length` and
     * `image_seq_length` are read from `this.config` after the base constructor runs.
     * @param {Object} components Passed through to the base constructor
     * (presumably supplies `this.tokenizer` and the other components - confirm in `Processor`).
     * @param {any} chat_template Passed through to the base constructor.
     */
    constructor(config, components, chat_template) {
        super(config, components, chat_template);

        this.audio_seq_length = this.config.audio_seq_length;
        this.image_seq_length = this.config.image_seq_length;

        const {
            // Audio tokens
            audio_token_id, boa_token, audio_token, eoa_token,
            // Image tokens
            image_token_id, boi_token, image_token, eoi_token,
        } = this.tokenizer.config;

        this.audio_token_id = audio_token_id;
        this.boa_token = boa_token;
        this.audio_token = audio_token;
        const audio_tokens_expanded = audio_token.repeat(this.audio_seq_length);
        this.full_audio_sequence = `\n\n${boa_token}${audio_tokens_expanded}${eoa_token}\n\n`;

        this.image_token_id = image_token_id;
        this.boi_token = boi_token;
        this.image_token = image_token;
        const image_tokens_expanded = image_token.repeat(this.image_seq_length);
        this.full_image_sequence = `\n\n${boi_token}${image_tokens_expanded}${eoi_token}\n\n`;
    }

    /**
     * Expands media placeholders in the prompt(s), runs the feature extractor
     * and/or image processor when the corresponding input is given, and
     * tokenizes the resulting text.
     *
     * @param {string|string[]} text A single prompt or a list of prompts.
     * @param {RawImage|RawImage[]|RawImage[][]} images Optional images. When truthy,
     * every occurrence of the image token in each prompt is expanded.
     * @param {RawAudio|RawAudio[]|RawAudio[][]} audio Optional audio. When truthy,
     * every occurrence of the audio token in each prompt is expanded.
     * @param {Object} [options={}] Passed unchanged to the feature extractor,
     * the image processor and the tokenizer.
     * @returns {Promise<any>} The tokenizer output merged with the image and audio
     * outputs; on a key collision image entries override text entries and audio
     * entries override both.
     */
    async _call(text, images = null, audio = null, options = {}) {
        // Insert the expanded sequence literally. A replacer function is used because
        // a plain replacement string would have `$&`, `$$`, etc. interpreted as
        // substitution patterns, corrupting any token that contains `$`.
        const expand = (prompts, token, sequence) =>
            prompts.map((prompt) => prompt.replaceAll(token, () => sequence));

        let prompts = typeof text === 'string' ? [text] : text;

        let audio_inputs;
        if (audio) {
            audio_inputs = await this.feature_extractor(audio, options);
            prompts = expand(prompts, this.audio_token, this.full_audio_sequence);
        }

        let image_inputs;
        if (images) {
            image_inputs = await this.image_processor(images, options);
            prompts = expand(prompts, this.image_token, this.full_image_sequence);
        }

        // Not awaited, as in the original (presumably the tokenizer call is synchronous - confirm).
        const text_inputs = this.tokenizer(prompts, options);

        // Spreading `undefined` adds no keys, so absent modalities contribute nothing.
        return {
            ...text_inputs,
            ...image_inputs,
            ...audio_inputs,
        };
    }
}