UNPKG

viral-video-kit

Version:

CLI to generate 60s vertical video kits and optionally render MP4 using ffmpeg

613 lines (553 loc) 20.2 kB
import fs from "node:fs/promises"; import path from "node:path"; import os from "node:os"; import { spawn } from "node:child_process"; import { fileURLToPath } from "node:url"; import OpenAI from "openai"; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); // ---------- Config ---------- const WIDTH = 1080; const HEIGHT = 1920; const FPS = 30; const DEFAULTS = { TEXT_MODEL: "gpt-5", IMAGE_MODEL: "gpt-image-1", TTS_MODEL: "gpt-4o-mini-tts", TTS_VOICE: "alloy", VIDEO_SEC: 60, SCENES_COUNT: 6, }; // Load user config from ~/.config/viral-video/config.json (or XDG_CONFIG_HOME) function configPaths() { const home = os.homedir(); const cfgRoot = process.env.XDG_CONFIG_HOME || path.join(home, ".config"); const dir = path.join(cfgRoot, "viral-video"); const file = path.join(dir, "config.json"); return { dir, file }; } export async function loadUserConfig() { const { file } = configPaths(); try { const raw = await fs.readFile(file, "utf8"); const json = JSON.parse(raw); return json && typeof json === "object" ? json : {}; } catch { return {}; } } function slugify(s) { return s.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/(^-|-$)/g, ""); } async function ensureDir(p) { await fs.mkdir(p, { recursive: true }); } async function writeJSON(p, obj) { await fs.writeFile(p, JSON.stringify(obj, null, 2), "utf8"); } function imageStyleBlock(imageStyle) { if (imageStyle === "realistic") { return "photorealistic, high detail, realistic lighting, natural textures"; } if (imageStyle === "ai-generated") { return "AI-generated art style, algorithmic patterns, modern generative design"; } // default cartoon return "stylized, cartoon, bold outlines, soft gradients, high contrast"; } // ---------- Generators ---------- async function generateScript({ topic, client, cfg, dryRun }) { if (dryRun) { const sections = [ { label: "Intro/Context", sec: 10, text: `Intro on: ${topic}` }, { label: "Point 1", sec: 14, text: `Point 1 about ${topic}` }, { label: "Point 2", sec: 14, text: `Point 2 about ${topic}` }, { label: "Point 3", sec: 10, text: `Point 3 about ${topic}` }, { label: "Wrap/CTA", sec: 12, text: `Wrap and CTA for ${topic}` }, ]; const scenes = Array.from({ length: cfg.SCENES_COUNT }, (_, i) => ({ i: i + 1, sec: Math.round(cfg.VIDEO_SEC / cfg.SCENES_COUNT), text: sections[Math.min(i, sections.length - 1)].text, })); const imagePrompts = Array.from( { length: cfg.SCENES_COUNT }, (_, i) => `Placeholder scene ${i + 1} for ${topic}, vertical 1080x1920.` ); return { title: topic, hook: `Why ${topic} matters in 60 seconds`, sections, scenes, imagePrompts, ttsStyle: "male, smooth, educational", disclaimer: "Educational only. Not financial advice.", }; } const prompt = `You are a concise scriptwriter for 60-second vertical videos (TikTok). Audience: beginner to intermediate. Goal: educational, calm, trustworthy voice. Topic: "${topic}" Deliver JSON with: { "title": "Short catchy title", "hook": "0-3s strong hook", "sections": [ {"label": "Intro/Context", "sec": 10, "text": "..."}, {"label": "Point 1", "sec": 14, "text": "..."}, {"label": "Point 2", "sec": 14, "text": "..."}, {"label": "Point 3", "sec": 10, "text": "..."}, {"label": "Wrap/CTA", "sec": 9, "text": "..."} ], "image_prompts": [ // exactly ${cfg.SCENES_COUNT} prompts for vertical 1080x1920 frames, descriptive, vivid, non-duplicative ], "tts_style": "male or female, smooth, educational", "disclaimer": "Educational only. Not financial advice." } Total seconds should sum to ~${cfg.VIDEO_SEC}. Keep jargon minimal.`; const res = await client.chat.completions.create({ model: cfg.TEXT_MODEL, messages: [ { role: "system", content: "Return only valid JSON. No commentary." }, { role: "user", content: prompt }, ],response_format: { type: "json_object" }, }); let data; try { data = JSON.parse(res.choices[0].message.content); } catch { throw new Error("Model did not return valid JSON."); } const total = data.sections?.reduce((a, b) => a + (b.sec || 0), 0) || cfg.VIDEO_SEC; const scale = cfg.VIDEO_SEC / Math.max(1, total); let durations = data.sections?.map((s) => Math.max(2, Math.round(s.sec * scale))) || []; let sum = durations.reduce((a, b) => a + b, 0); while (sum > cfg.VIDEO_SEC) { durations[durations.length - 1]--; sum--; } while (sum < cfg.VIDEO_SEC) { durations[durations.length - 1]++; sum++; } const scenes = []; let idx = 0; for (let i = 0; i < cfg.SCENES_COUNT; i++) { const sec = Math.round(cfg.VIDEO_SEC / cfg.SCENES_COUNT); const secText = data.sections?.[idx]?.text || data.hook || data.title; scenes.push({ i: i + 1, sec, text: secText }); idx = Math.min(idx + 1, (data.sections?.length || 1) - 1); } let imagePrompts = Array.isArray(data.image_prompts) ? data.image_prompts.slice(0, cfg.SCENES_COUNT) : []; while (imagePrompts.length < cfg.SCENES_COUNT) { imagePrompts.push(`Vertical frame illustrating "${topic}", clean composition, high contrast, 1080x1920.`); } return { title: data.title || topic, hook: data.hook || "", sections: data.sections || [], scenes, imagePrompts, ttsStyle: data.tts_style || "male, smooth, educational", disclaimer: data.disclaimer || "", }; } async function generateImage({ promptText, outPng, client, cfg, dryRun, imageStyle }) { if (dryRun) { await fs.writeFile(outPng, ""); return; } const style = imageStyleBlock(imageStyle); const img = await client.images.generate({ model: cfg.IMAGE_MODEL, // OpenAI Images API supports: 1024x1024, 1024x1536 (portrait), 1536x1024 (landscape), or "auto" // We generate portrait at 1024x1536, then ffmpeg scales to 1080x1920 during render. size: "1024x1536", prompt: `${promptText}\nStyle: ${style}; vertical 1080x1920, clean composition, minimal text.`, quality: "high", }); const b64 = img.data[0].b64_json; const buf = Buffer.from(b64, "base64"); await fs.writeFile(outPng, buf); } async function synthesizeTTS({ text, outMp3, client, cfg, dryRun }) { if (dryRun) { await fs.writeFile(outMp3, ""); return; } const speech = await client.audio.speech.create({ model: cfg.TTS_MODEL, voice: cfg.TTS_VOICE, input: text, }); const buf = Buffer.from(await speech.arrayBuffer()); await fs.writeFile(outMp3, buf); } // ---------- Captions / Storyboard ---------- function splitForCaptions(text, totalSec) { const parts = text.replace(/\n+/g, " ").split(/(?<=[.!?])\s+/).filter(Boolean).slice(0, 10); const each = Math.max(2, Math.floor(totalSec / Math.max(1, parts.length))); const spans = []; let t = 0; for (let i = 0; i < parts.length; i++) { const start = t; const end = i === parts.length - 1 ? totalSec : Math.min(totalSec, t + each); spans.push({ start, end, text: parts[i] }); t = end; } if (spans.length === 0) spans.push({ start: 0, end: totalSec, text }); return spans; } function toAss(cues) { const header = ` [Script Info] Title=Captions ScriptType=v4.00+ PlayResX=${WIDTH} PlayResY=${HEIGHT} ScaledBorderAndShadow=yes [V4+ Styles] Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding Style: Caption,Montserrat SemiBold,64,&H00FFFFFF,&H00000000,&H96000000,&H64000000,-1,0,0,0,100,100,0,0,1,6,0,2,80,80,120,0 [Events] Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text `.trim(); const lines = cues.map((c) => { const fmt = (s) => { const hh = String(Math.floor(s / 3600)).padStart(2, "0"); const mm = String(Math.floor((s % 3600) / 60)).padStart(2, "0"); const ss = (s % 60).toFixed(2).padStart(5, "0"); return `${hh}:${mm}:${ss}`; }; return `Dialogue: 0,${fmt(c.start)},${fmt(c.end)},Caption,,0,0,0,,${c.text.replace(/\n/g, "\\N")}`; }); return `${header}\n${lines.join("\n")}\n`; } function toStoryboard(scenePngs, perScene) { const rows = ["filename,start,duration,cue"]; let t = 0; for (let i = 0; i < scenePngs.length; i++) { rows.push(`${path.basename(scenePngs[i])},${t},${perScene},${i + 1}`); t += perScene; } return rows.join("\n") + "\n"; } // ---------- ffmpeg helpers ---------- async function hasFfmpeg() { return new Promise((resolve) => { const p = spawn("ffmpeg", ["-version"]); p.on("error", () => resolve(false)); p.on("close", (code) => resolve(code === 0)); }); } async function renderVideo(outDir, perScene) { const scenesDir = path.join(outDir, "scenes"); const buildDir = path.join(outDir, "build"); await ensureDir(path.join(buildDir, "segs")); const csv = await fs.readFile(path.join(outDir, "storyboard.csv"), "utf8"); const lines = csv.trim().split("\n").slice(1); for (const line of lines) { if (!line.trim()) continue; const [fname, , durationStr] = line.split(","); const duration = parseInt(durationStr, 10); const base = fname.replace(/\.png$/i, ""); const inP = path.join(scenesDir, fname); const outP = path.join(buildDir, "segs", `${base}.mp4`); await new Promise((resolve, reject) => { const args = [ "-nostdin", "-y", "-loop", "1", "-t", String(duration), "-i", inP, "-vf", `scale=${WIDTH}:${HEIGHT},zoompan=z='min(zoom+0.0009,1.06)':x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':fps=${FPS}:d=${ duration * FPS },format=yuv420p`, "-r", String(FPS), "-pix_fmt", "yuv420p", "-an", outP, ]; const p = spawn("ffmpeg", args, { stdio: "inherit" }); p.on("error", reject); p.on("close", (code) => (code === 0 ? resolve() : reject(new Error("ffmpeg seg fail")))); }); } const concatTxt = (await fs.readdir(path.join(buildDir, "segs"))) .filter((f) => f.endsWith(".mp4")) .sort() .map((f) => `file '${path.join(buildDir, "segs", f)}'`) .join("\n"); const concatPath = path.join(buildDir, "concat.txt"); await fs.writeFile(concatPath, concatTxt, "utf8"); const nocaptions = path.join(buildDir, "video_nocaptions.mp4"); await new Promise((resolve, reject) => { const p = spawn("ffmpeg", ["-nostdin", "-y", "-f", "concat", "-safe", "0", "-i", concatPath, "-c", "copy", nocaptions], { stdio: "inherit", }); p.on("error", reject); p.on("close", (code) => (code === 0 ? resolve() : reject(new Error("concat fail")))); }); const captions = path.join(outDir, "captions.ass"); const withCaptions = path.join(buildDir, "video_captions.mp4"); await new Promise((resolve, reject) => { const p = spawn("ffmpeg", ["-nostdin", "-y", "-i", nocaptions, "-vf", `ass=${captions}`, "-c:a", "copy", withCaptions], { stdio: "inherit", }); p.on("error", reject); p.on("close", (code) => (code === 0 ? resolve() : reject(new Error("burn captions fail")))); }); const voice = path.join(outDir, "audio", "voiceover.mp3"); const music = path.join(outDir, "audio", "music.mp3"); const output = path.join(outDir, "output.mp4"); const exists = async (p) => !!(await fs.stat(p).catch(() => false)); const haveVO = await exists(voice); const haveBG = await exists(music); if (haveVO && haveBG) { await new Promise((resolve, reject) => { const args = [ "-nostdin", "-y", "-i", withCaptions, "-i", voice, "-i", music, "-filter_complex", "[1:a]aformat=channel_layouts=stereo,volume=1.0[vo];[2:a]aformat=channel_layouts=stereo,compand=gain=-2[bg];[bg][vo]sidechaincompress=threshold=0.05:ratio=8:attack=5:release=300[ducked];[ducked]volume=0.5[mix]", "-map", "0:v", "-map", "[mix]", "-c:v", "libx264", "-profile:v", "high", "-level", "4.1", "-pix_fmt", "yuv420p", "-r", String(FPS), "-c:a", "aac", "-b:a", "192k", "-shortest", output, ]; const p = spawn("ffmpeg", args, { stdio: "inherit" }); p.on("error", reject); p.on("close", (code) => (code === 0 ? resolve() : reject(new Error("audio mix fail")))); }); } else if (haveVO) { await new Promise((resolve, reject) => { const p = spawn( "ffmpeg", [ "-nostdin", "-y", "-i", withCaptions, "-i", voice, "-map", "0:v", "-map", "1:a", "-c:v", "libx264", "-pix_fmt", "yuv420p", "-r", String(FPS), "-c:a", "aac", "-b:a", "192k", "-shortest", output, ], { stdio: "inherit" } ); p.on("error", reject); p.on("close", (code) => (code === 0 ? resolve() : reject(new Error("mux VO fail")))); }); } else if (haveBG) { await new Promise((resolve, reject) => { const p = spawn( "ffmpeg", [ "-nostdin", "-y", "-i", withCaptions, "-i", music, "-map", "0:v", "-map", "1:a", "-c:v", "libx264", "-pix_fmt", "yuv420p", "-r", String(FPS), "-c:a", "aac", "-b:a", "192k", "-shortest", output, ], { stdio: "inherit" } ); p.on("error", reject); p.on("close", (code) => (code === 0 ? resolve() : reject(new Error("mux music fail")))); }); } else { await fs.copyFile(withCaptions, output); } console.log(`\n✅ Rendered ${output}`); } // ---------- Public API ---------- export async function run(topic, options = {}) { if (!topic || typeof topic !== "string") { throw new Error('Missing required "topic"'); } const userCfg = await loadUserConfig(); // Precedence: env > user config > defaults const cfg = { TEXT_MODEL: process.env.TEXT_MODEL || userCfg.TEXT_MODEL || DEFAULTS.TEXT_MODEL, IMAGE_MODEL: process.env.IMAGE_MODEL || userCfg.IMAGE_MODEL || DEFAULTS.IMAGE_MODEL, TTS_MODEL: process.env.TTS_MODEL || userCfg.TTS_MODEL || DEFAULTS.TTS_MODEL, TTS_VOICE: process.env.TTS_VOICE || userCfg.TTS_VOICE || DEFAULTS.TTS_VOICE, VIDEO_SEC: parseInt(process.env.VIDEO_SEC || userCfg.VIDEO_SEC || DEFAULTS.VIDEO_SEC, 10), SCENES_COUNT: parseInt(process.env.SCENES_COUNT || userCfg.SCENES_COUNT || DEFAULTS.SCENES_COUNT, 10), ELEVENLABS_API_KEY: process.env.ELEVENLABS_API_KEY || userCfg.ELEVENLABS_API_KEY, }; const dryRun = options.dryRun === true; // Gender -> voice mapping (flags override env/default) if (options.gender === "male") cfg.TTS_VOICE = "alloy"; if (options.gender === "female") cfg.TTS_VOICE = "luna"; // Image style selection (default cartoon) const imageStyle = options.style || "cartoon"; // Progress setup const onProgress = typeof options.onProgress === "function" ? options.onProgress : null; const orientations = [ { name: "vertical", WIDTH: 1080, HEIGHT: 1920 }, { name: "horizontal", WIDTH: 1920, HEIGHT: 1080 }, ]; const ffmpegAvailable = !dryRun && (await hasFfmpeg()); let total = 0; total += 1; // script total += 1; // tts total += 3; // root files: script.json, voiceover.txt, README.md for (const o of orientations) { total += cfg.SCENES_COUNT; // images total += 1; // captions total += 1; // storyboard if (ffmpegAvailable) total += 1; // render } let current = 0; const tick = (message) => { current++; onProgress && onProgress({ current, total, message }); }; const OPENAI_API_KEY = process.env.OPENAI_API_KEY || userCfg.OPENAI_API_KEY || ""; if (!OPENAI_API_KEY && !dryRun) { throw new Error("Missing OPENAI_API_KEY. Set environment variable or run 'viral setup'."); } const client = dryRun ? null : new OpenAI({ apiKey: OPENAI_API_KEY }); const slug = slugify(topic); const outDir = path.join(process.cwd(), "build", slug); const audioDir = path.join(outDir, "audio"); await ensureDir(audioDir); const plan = await generateScript({ topic, client, cfg, dryRun }); tick("Generated script"); // Override ttsStyle in saved metadata when gender flag provided if (options.gender) { plan.ttsStyle = `${options.gender}, smooth, educational`; } const voText = [plan.hook, ...plan.sections.map((s) => s.text), plan.disclaimer].filter(Boolean).join("\n"); await writeJSON(path.join(outDir, "script.json"), plan); tick("Wrote script.json"); await fs.writeFile(path.join(outDir, "voiceover.txt"), voText, "utf8"); tick("Wrote voiceover.txt"); const perScene = Math.round(cfg.VIDEO_SEC / cfg.SCENES_COUNT); // Synthesize TTS once at the root (reused for both orientations) await synthesizeTTS({ text: voText, outMp3: path.join(audioDir, "voiceover.mp3"), client, cfg, dryRun, }); tick("Synthesized voiceover"); // Write a per-topic README at the root describing both outputs const readme = `# Video kit for: ${topic} - Orientations: vertical (1080x1920) and horizontal (1920x1080) - Scenes per orientation: ${cfg.SCENES_COUNT} PNGs in <orientation>/scenes/ - Voiceover: audio/voiceover.mp3 (root), copied into each <orientation>/audio/ - Captions: <orientation>/captions.ass - Storyboard: <orientation>/storyboard.csv - Duration: ~${cfg.VIDEO_SEC}s, ${perScene}s per scene - Image style: ${imageStyle} ## Render If ffmpeg is installed, this CLI renders <orientation>/output.mp4 per orientation. - macOS: brew install ffmpeg - Ubuntu: sudo apt-get update && sudo apt-get install -y ffmpeg `; await fs.writeFile(path.join(outDir, "README.md"), readme, "utf8"); tick("Wrote per-topic README"); // Prepare shared caption cues once const cues = splitForCaptions(voText, cfg.VIDEO_SEC); for (const o of orientations) { const ocfg = { ...cfg, WIDTH: o.WIDTH, HEIGHT: o.HEIGHT }; const oDir = path.join(outDir, o.name); const scenesDir = path.join(oDir, "scenes"); const oAudioDir = path.join(oDir, "audio"); await ensureDir(scenesDir); await ensureDir(oAudioDir); // Copy synthesized VO (and optional BG music) into each orientation folder try { await fs.copyFile(path.join(audioDir, "voiceover.mp3"), path.join(oAudioDir, "voiceover.mp3")); } catch {} try { await fs.copyFile(path.join(audioDir, "music.mp3"), path.join(oAudioDir, "music.mp3")); } catch {} const sceneFiles = []; for (let i = 0; i < cfg.SCENES_COUNT; i++) { const prompt = plan.imagePrompts[i] || `${topic}, ${imageStyle === "realistic" ? "photorealistic" : imageStyle === "ai-generated" ? "AI-generated" : "stylized cartoon"} ${o.name} frame, ${ocfg.WIDTH}x${ocfg.HEIGHT}.`; const name = `scene${String(i + 1).padStart(2, "0")}.png`; const outPng = path.join(scenesDir, name); await generateImage({ promptText: prompt, outPng, client, cfg: ocfg, dryRun, imageStyle }); sceneFiles.push(outPng); tick(`Generated image ${i + 1}/${cfg.SCENES_COUNT} (${o.name})`); } const assText = toAss(cues, ocfg); await fs.writeFile(path.join(oDir, "captions.ass"), assText, "utf8"); tick(`Wrote captions (${o.name})`); const storyboard = toStoryboard(sceneFiles, perScene); await fs.writeFile(path.join(oDir, "storyboard.csv"), storyboard, "utf8"); tick(`Wrote storyboard (${o.name})`); if (ffmpegAvailable) { await renderVideo(oDir, perScene, ocfg); tick(`Rendered video (${o.name})`); } else if (!dryRun) { console.log(`⚠️ ffmpeg not found. Assets are ready in: ${oDir}`); } } return outDir; }