@aj-archipelago/cortex
Version:
Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.
290 lines (236 loc) • 9.38 kB
JavaScript
import logger from "../lib/logger.js";
import { publishRequestProgress } from "../lib/redisSubscription.js";
import { alignSubtitles, getMediaChunks } from "../lib/util.js";
import { Prompt } from "../server/prompt.js";
const OFFSET_CHUNK = 500; // fallback chunk offset in seconds, used only when the media helper does not provide per-chunk offsets
/**
 * Determines whether a URL points to a specific YouTube video.
 *
 * Accepts desktop (`youtube.com`, `www.youtube.com`) and mobile
 * (`m.youtube.com`) hosts for /watch, /embed/ and /shorts/ URLs, plus the
 * shortened `youtu.be` form. A URL only counts if it actually carries a
 * video ID — a bare `/watch` or `/embed/` is rejected.
 *
 * @param {string} url - Candidate URL (any string; malformed input is safe).
 * @returns {boolean} true if the URL identifies a YouTube video.
 */
function isYoutubeUrl(url) {
    try {
        const { hostname, pathname, searchParams } = new URL(url);
        // Standard desktop and mobile youtube.com domains
        if (
            hostname === "youtube.com" ||
            hostname === "www.youtube.com" ||
            hostname === "m.youtube.com"
        ) {
            // For standard watch URLs, verify they have a video ID
            if (pathname === "/watch") {
                return !!searchParams.get("v");
            }
            // For embed URLs, verify they have a video ID in the path
            if (pathname.startsWith("/embed/")) {
                return pathname.length > 7; // '/embed/' is 7 chars
            }
            // For shorts URLs, verify they have a video ID in the path
            if (pathname.startsWith("/shorts/")) {
                return pathname.length > 8; // '/shorts/' is 8 chars
            }
            return false;
        }
        // Check for shortened youtu.be domain: the video ID is the whole path
        if (hostname === "youtu.be") {
            return pathname.length > 1; // '/' is 1 char
        }
        return false;
    } catch {
        // new URL() throws on malformed input — not a URL at all
        return false;
    }
}
/**
 * Cortex pathway: transcribes a media file with a Gemini vision model.
 *
 * The file is split into chunks (unless it is a gs:// URI or a YouTube URL,
 * which are sent whole), each chunk is transcribed in parallel, progress is
 * streamed out via Redis, and timestamped formats (srt/vtt) are re-aligned
 * into one continuous subtitle track before returning.
 */
export default {
    // Pass-through prompt: the actual chat messages are built per chunk by
    // getMessages() inside executePathway and injected via {{messages}}.
    prompt:
    [
        new Prompt({ messages: [
            "{{messages}}",
        ]}),
    ],
    model: 'gemini-pro-25-vision',
    inputParameters: {
        file: ``,               // media to transcribe: gs:// URI, YouTube URL, or any URL the media helper can chunk
        language: ``,           // language hint (not currently used when building the prompt)
        responseFormat: `text`, // 'text' | 'srt' | 'vtt'
        wordTimestamped: false, // if true, request one timestamped cue per word
        highlightWords: false,
        maxLineWidth: 0,        // max characters per subtitle line; 0 disables the limit
        maxLineCount: 0,
        maxWordsPerLine: 0,
        contextId: ``,
    },
    timeout: 3600, // in seconds
    enableDuplicateRequests: false,
    geminiSafetySettings: [{category: 'HARM_CATEGORY_DANGEROUS_CONTENT', threshold: 'BLOCK_ONLY_HIGH'},
        {category: 'HARM_CATEGORY_SEXUALLY_EXPLICIT', threshold: 'BLOCK_ONLY_HIGH'},
        {category: 'HARM_CATEGORY_HARASSMENT', threshold: 'BLOCK_ONLY_HIGH'},
        {category: 'HARM_CATEGORY_HATE_SPEECH', threshold: 'BLOCK_ONLY_HIGH'}],
    executePathway: async ({args, runAllPrompts, resolver}) => {
        let intervalId;
        const { requestId } = resolver;
        try {
            let totalCount = 11; // initial guess at chunk count, replaced once chunks are known
            let completedCount = 0;
            let partialCount = 0;
            let partialRatio = 0;

            // Publishes a progress update for this request.
            // partial=true nudges the in-flight chunk's fractional progress
            // forward by a logarithmically diminishing increment (capped at
            // 0.99 so it never looks finished early); partial=false marks a
            // whole chunk complete and resets the fractional part.
            const sendProgress = (partial = false, resetCount = false) => {
                partialCount = resetCount ? 0 : partialCount;
                if (partial) {
                    partialCount++;
                    const increment = 0.02 / Math.log2(partialCount + 1); // logarithmic diminishing increment
                    partialRatio = Math.min(partialRatio + increment, 0.99); // limit to 0.99
                } else {
                    partialCount = 0;
                    partialRatio = 0;
                    completedCount++;
                }
                // Never publish 100% from here; completion is implied by the
                // pathway returning its result.
                if (completedCount >= totalCount) return;
                const progress = (completedCount + partialRatio) / totalCount;
                logger.info(`Progress for ${requestId}: ${progress}`);
                publishRequestProgress({
                    requestId,
                    progress,
                    data: null,
                });
            };

            sendProgress(true);
            // Keep the progress bar creeping while chunks transcribe
            intervalId = setInterval(() => sendProgress(true), 3000);

            const { file, wordTimestamped, maxLineWidth } = args;
            const responseFormat = args.responseFormat || 'text';
            if (!file) {
                throw new Error("Please provide a file to transcribe.");
            }

            // Check if file is a GCS file or YouTube URL — those are sent to
            // the model as a single chunk; everything else is split by the
            // media helper API.
            const isGcs = file.startsWith('gs://');
            const isYoutube = isYoutubeUrl(file);
            let chunks = [{
                url: file,
                gcs: file,
                offset: 0,
            }];
            if (!isGcs && !isYoutube) {
                // Get chunks from the helper API if not GCS or YouTube
                chunks = await getMediaChunks(file, requestId);
            }
            totalCount = chunks.length + 1; // +1 keeps headroom for final assembly
            logger.info(`Processing chunks: ${JSON.stringify(chunks)}`);
            sendProgress(true);

            // Extra prompt text asking the model to respect subtitle line limits
            let respectLimitsPrompt = "";
            if (maxLineWidth) {
                const possiblePlacement = maxLineWidth <= 25
                    ? "vertical" : maxLineWidth <= 35 ? "horizontal" : "";
                respectLimitsPrompt += ` These subtitles will be shown in a ${possiblePlacement} formatted video player. Each subtitle line should not exceed ${maxLineWidth} characters to fit the player.`;
            }

            // Builds the chat messages (system + user) for transcribing one
            // media chunk, tailored to the requested output format.
            function getMessages(file) {
                // Base system content that's always included
                let systemContent = `Instructions:
You are a transcription assistant. Your job is to transcribe the audio/video content accurately.
IMPORTANT: Only provide the transcription in your response - no explanations, comments, or additional text.
Format your response in ${responseFormat} format.`;

                // Only include timestamp instructions if we're not using plain text format
                if (responseFormat !== 'text') {
                    systemContent += `
CRITICAL TIMESTAMP INSTRUCTIONS:
- Timestamps MUST match the actual timing in the media
- For each new segment, look at the media time directly
- Start times should precisely match when spoken words begin
- Consecutive segments should have matching end/start times (no gaps or overlaps)`;
                }

                systemContent += `
Examples:
SRT format:
1
00:00:00,498 --> 00:00:02,827
Hello World!
2
00:00:02,827 --> 00:00:06,383
Being AI is fun!
VTT format:
WEBVTT
1
00:00:00.000 --> 00:00:02.944
Hello World!
2
00:00:02.944 --> 00:00:08.809
Being AI is great!
Text format:
Hello World! Being AI is great!`;

                if (wordTimestamped) {
                    systemContent += `
For word-level transcription, timestamp each word:
WEBVTT
1
00:00:00.000 --> 00:00:01.944
Hello
2
00:00:01.944 --> 00:00:02.383
World!
`;
                }

                // Only include anti-drift procedure and timestamp reminders for non-text formats
                if (responseFormat !== 'text') {
                    systemContent += `
ANTI-DRIFT PROCEDURE:
1. For EVERY new segment, check the actual media time directly
2. After every 5 segments, verify your timestamps against the video/audio
3. Never calculate timestamps based on previous segments
4. Always match the end time of one segment with the start time of the next
REMEMBER:
- Transcription accuracy is your primary goal
- Timestamp accuracy is equally important
- Timestamp drift is the most common error - actively prevent it
- When in doubt, check the media time directly`;
                }

                // NOTE(review): the text part is a *stringified* pseudo-object
                // while the media part uses JSON.stringify — presumably the
                // downstream model plugin parses both forms; confirm before
                // normalizing.
                const messages = [
                    {"role": "system", "content": systemContent},
                    {"role": "user", "content": [
                        `{ type: 'text', text: 'Transcribe this file in ${responseFormat} format.${respectLimitsPrompt} Output only the transcription, no other text or comments or formatting.' }`,
                        JSON.stringify({
                            type: 'image_url',
                            url: file,
                            gcs: file
                        })
                    ]},
                ];
                return messages;
            }

            // Transcribes all chunks concurrently, reporting a completed unit
            // of progress as each finishes, and returns results in the
            // original chunk order regardless of completion order.
            const processChunksParallel = async (chunks, args) => {
                try {
                    const chunkPromises = chunks.map(async (chunk, index) => ({
                        index,
                        result: await runAllPrompts({
                            ...args,
                            // chunk.gcs for GCS/YouTube; helper-produced chunks
                            // presumably expose .uri — TODO confirm helper shape
                            messages: getMessages(chunk.gcs || chunk.uri),
                            requestId: `${requestId}-${index}`
                        })
                    }));

                    const results = await Promise.all(
                        chunkPromises.map(promise =>
                            promise.then(result => {
                                sendProgress(); // one whole chunk done
                                return result;
                            })
                        ));

                    return results
                        .sort((a, b) => a.index - b.index)
                        .map(item => item.result);
                } catch (error) {
                    logger.error('Error processing chunks:', error);
                    throw error;
                }
            };

            const result = await processChunksParallel(chunks, args);

            if (['srt', 'vtt'].includes(responseFormat.toLowerCase()) || wordTimestamped) { // align subtitles for timestamped formats
                // Use ?? (not ||) so a chunk with a legitimate offset of 0 is
                // respected instead of being replaced by the index-based fallback.
                const offsets = chunks.map((chunk, index) => chunk?.offset ?? index * OFFSET_CHUNK);
                return alignSubtitles(result, responseFormat, offsets);
            }
            return result.join(` `);
        } catch (error) {
            logger.error(`Error in transcribing: ${error}`);
            throw error;
        } finally {
            // Always stop the heartbeat timer, even on error
            intervalId && clearInterval(intervalId);
        }
    }
};