diff --git a/.gitignore b/.gitignore index 2d6a637..4807c38 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,10 @@ # Remotion render output out/ +# Rhubarb lip sync intermediate outputs +voicevox-remotion-template/public/lipsync/raw/ +voicevox-remotion-template/src/generated/lipsync/*.mouth.json + # Build artifacts and caches dist/ build/ diff --git a/voicevox-remotion-template/.eslintrc.cjs b/voicevox-remotion-template/.eslintrc.cjs index 0217ed1..8d47931 100644 --- a/voicevox-remotion-template/.eslintrc.cjs +++ b/voicevox-remotion-template/.eslintrc.cjs @@ -1,6 +1,11 @@ module.exports = { root: true, extends: ["@remotion"], + ignorePatterns: [ + "public/image/*-rhubarb-mouths/rhubarb-map.js", + "public/lipsync/", + "src/generated/lipsync/*.json", + ], env: { node: true, }, diff --git a/voicevox-remotion-template/package.json b/voicevox-remotion-template/package.json index f41b620..0d331e4 100644 --- a/voicevox-remotion-template/package.json +++ b/voicevox-remotion-template/package.json @@ -7,6 +7,8 @@ "start": "remotion preview", "render": "remotion render", "lint": "eslint .", + "lipsync:generate": "node scripts/generate-lipsync.js", + "test:lipsync": "node --test scripts/lipsync-utils.test.js", "voice:generate": "node scripts/voicevox-generate.js", "voice:generate:pizza-kiln": "node scripts/voicevox-generate.js --script src/data/pizza-kiln/script.ts --output public/audio/pizza-kiln/lines --manifest src/data/pizza-kiln/voicevox-manifest.json" }, diff --git a/voicevox-remotion-template/scripts/generate-lipsync.js b/voicevox-remotion-template/scripts/generate-lipsync.js new file mode 100644 index 0000000..d44ebb1 --- /dev/null +++ b/voicevox-remotion-template/scripts/generate-lipsync.js @@ -0,0 +1,299 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import {spawn} from "node:child_process"; +import {fileURLToPath} from "node:url"; +import {normalizeRhubarbJson} from "./lipsync-utils.js"; + +const projectRoot = path.resolve( + fileURLToPath(new URL("..", import.meta.url)) +); +const publicDir = path.join(projectRoot, "public"); +const generatedDir = path.join(projectRoot, "src/generated/lipsync"); +const rawDir = path.join(publicDir, "lipsync/raw"); + +const DEFAULT_SOURCE_MANIFESTS = [ + "src/data/voicevox-manifest.json", + "src/data/pizza-kiln/voicevox-manifest.json", +]; + +const resolveProjectPath = (value) => + path.isAbsolute(value) ? value : path.resolve(projectRoot, value); + +const toProjectRelative = (targetPath) => + path.relative(projectRoot, targetPath).split(path.sep).join("/"); + +const toPublicRelative = (targetPath) => { + const relativePath = path.relative(publicDir, targetPath); + if (relativePath.startsWith("..") || path.isAbsolute(relativePath)) { + return toProjectRelative(targetPath); + } + + return relativePath.split(path.sep).join("/"); +}; + +const pathExists = async (targetPath) => { + try { + await fs.access(targetPath); + return true; + } catch { + return false; + } +}; + +const parseArgs = () => { + const values = { + out: undefined, + rawOut: undefined, + manifest: "src/generated/lipsync/manifest.json", + }; + const audioPaths = []; + const args = process.argv.slice(2); + + for (let index = 0; index < args.length; index += 1) { + const arg = args[index]; + if (!arg.startsWith("--")) { + audioPaths.push(arg); + continue; + } + + const key = arg.slice(2); + if (!["out", "raw-out", "manifest"].includes(key)) { + throw new Error(`Unknown option "${arg}".`); + } + + const value = args[index + 1]; + if (!value || value.startsWith("--")) { + throw new Error(`Option "${arg}" needs a value.`); + } + + if (key === "raw-out") { + values.rawOut = value; + } else { + values[key] = value; + } + index += 1; + } + + if (audioPaths.length > 1) { + throw new Error("Only one audio path can be specified."); + } + if (!audioPaths[0] && (values.out || values.rawOut)) { + throw new Error("--out and --raw-out can only be used with one audio path."); + } + + return { + audioPath: audioPaths[0], + outPath: values.out ? resolveProjectPath(values.out) : undefined, + rawOutPath: values.rawOut ? resolveProjectPath(values.rawOut) : undefined, + manifestPath: resolveProjectPath(values.manifest), + }; +}; + +const executableNames = () => + process.platform === "win32" + ? ["rhubarb.exe", "rhubarb.cmd", "rhubarb"] + : ["rhubarb"]; + +const findRhubarbBin = async () => { + if (process.env.RHUBARB_BIN) { + const envPath = resolveProjectPath(process.env.RHUBARB_BIN); + if (await pathExists(envPath)) { + return envPath; + } + throw new Error(`RHUBARB_BIN was set, but not found: ${envPath}`); + } + + const candidates = []; + for (const name of executableNames()) { + candidates.push(path.join(projectRoot, "node_modules/.bin", name)); + candidates.push(path.join(projectRoot, "tools/rhubarb", name)); + candidates.push(path.join(projectRoot, "vendor/rhubarb", name)); + } + + for (const candidate of candidates) { + if (await pathExists(candidate)) { + return candidate; + } + } + + for (const directory of (process.env.PATH ?? "").split(path.delimiter)) { + for (const name of executableNames()) { + const candidate = path.join(directory, name); + if (await pathExists(candidate)) { + return candidate; + } + } + } + + throw new Error( + [ + "Rhubarb Lip Sync CLI was not found.", + "Set RHUBARB_BIN to the Rhubarb executable path to use any installed CLI.", + "Executable names differ by OS, for example rhubarb, rhubarb.exe, or rhubarb.cmd.", + "When using a Dev Container, install the Linux Rhubarb binary and point RHUBARB_BIN to it.", + ].join("\n") + ); +}; + +const runRhubarb = (rhubarbBin, inputPath, rawOutputPath) => + new Promise((resolve, reject) => { + const args = [ + "--recognizer", + "phonetic", + "--exportFormat", + "json", + "--extendedShapes", + "X", + "--output", + rawOutputPath, + inputPath, + ]; + const child = spawn(rhubarbBin, args, {cwd: projectRoot}); + let stdout = ""; + let stderr = ""; + + child.stdout.on("data", (chunk) => { + stdout += chunk; + }); + child.stderr.on("data", (chunk) => { + stderr += chunk; + }); + child.on("error", reject); + child.on("close", (code) => { + if (code === 0) { + resolve({stdout, stderr}); + return; + } + + reject( + new Error( + `Rhubarb exited with code ${code} for ${toProjectRelative(inputPath)}.\n${stderr || stdout}` + ) + ); + }); + }); + +const loadJson = async (targetPath) => + JSON.parse(await fs.readFile(targetPath, "utf8")); + +const loadExistingGeneratedManifest = async (manifestPath) => { + if (!(await pathExists(manifestPath))) { + return {version: 1, timelines: {}}; + } + + const manifest = await loadJson(manifestPath); + if (manifest?.version !== 1 || typeof manifest.timelines !== "object") { + return {version: 1, timelines: {}}; + } + + return manifest; +}; + +const taskForAudioPath = async ({audioPath, outPath, rawOutPath}) => { + const inputPath = resolveProjectPath(audioPath); + if (!(await pathExists(inputPath))) { + throw new Error(`Input audio file was not found: ${audioPath}`); + } + + const id = path.basename(inputPath, path.extname(inputPath)); + + return { + id, + inputPath, + sourceAudio: toPublicRelative(inputPath), + rawOutputPath: rawOutPath ?? path.join(rawDir, `${id}.rhubarb.json`), + outputPath: outPath ?? path.join(generatedDir, `${id}.mouth.json`), + }; +}; + +const tasksForVoicevoxManifest = async (manifestPath) => { + if (!(await pathExists(manifestPath))) { + return []; + } + + const entries = await loadJson(manifestPath); + if (!Array.isArray(entries)) { + throw new Error(`${toProjectRelative(manifestPath)} must be a JSON array.`); + } + + return Promise.all( + entries.map(async (entry) => { + if (!entry?.id || !entry?.file) { + throw new Error( + `${toProjectRelative(manifestPath)} entries need id and file.` + ); + } + + const inputPath = path.join(publicDir, entry.file); + if (!(await pathExists(inputPath))) { + throw new Error(`Input audio file was not found: ${entry.file}`); + } + + return { + id: entry.id, + inputPath, + sourceAudio: entry.file, + rawOutputPath: path.join(rawDir, `${entry.id}.rhubarb.json`), + outputPath: path.join(generatedDir, `${entry.id}.mouth.json`), + }; + }) + ); +}; + +const defaultTasks = async () => { + const taskGroups = await Promise.all( + DEFAULT_SOURCE_MANIFESTS.map((manifest) => + tasksForVoicevoxManifest(resolveProjectPath(manifest)) + ) + ); + const tasks = taskGroups.flat(); + if (tasks.length === 0) { + throw new Error("No VOICEVOX manifest entries were found."); + } + + return tasks; +}; + +const writeJson = async (targetPath, value) => { + await fs.mkdir(path.dirname(targetPath), {recursive: true}); + await fs.writeFile(targetPath, `${JSON.stringify(value, null, 2)}\n`); +}; + +const generateTask = async (rhubarbBin, task) => { + await fs.mkdir(path.dirname(task.rawOutputPath), {recursive: true}); + await runRhubarb(rhubarbBin, task.inputPath, task.rawOutputPath); + + const rawJson = await loadJson(task.rawOutputPath); + const {timeline, warnings} = normalizeRhubarbJson(rawJson, { + audio: task.sourceAudio, + }); + + warnings.forEach((warning) => { + console.warn(`${task.id}: ${warning}`); + }); + await writeJson(task.outputPath, timeline); + + console.log( + `Wrote ${toProjectRelative(task.outputPath)} from ${toProjectRelative( + task.inputPath + )}` + ); + + return timeline; +}; + +const {audioPath, outPath, rawOutPath, manifestPath} = parseArgs(); +const rhubarbBin = await findRhubarbBin(); +const tasks = audioPath + ? [await taskForAudioPath({audioPath, outPath, rawOutPath})] + : await defaultTasks(); +const generatedManifest = audioPath + ? await loadExistingGeneratedManifest(manifestPath) + : {version: 1, timelines: {}}; + +for (const task of tasks) { + generatedManifest.timelines[task.id] = await generateTask(rhubarbBin, task); +} + +await writeJson(manifestPath, generatedManifest); +console.log(`Updated ${toProjectRelative(manifestPath)}`); diff --git a/voicevox-remotion-template/scripts/lipsync-utils.js b/voicevox-remotion-template/scripts/lipsync-utils.js new file mode 100644 index 0000000..af61e4f --- /dev/null +++ b/voicevox-remotion-template/scripts/lipsync-utils.js @@ -0,0 +1,82 @@ +export const RHUBARB_TO_JA_MOUTH = Object.freeze({ + X: "rest", + A: "closed", + B: "i", + C: "e", + D: "a", + E: "o", + F: "u", + G: "i", + H: "e", +}); + +export const JAPANESE_MOUTH_SHAPES = Object.freeze([ + "a", + "i", + "u", + "e", + "o", + "closed", + "rest", +]); + +export const mapRhubarbMouthToJapanese = (source, warnings = []) => { + const mouth = RHUBARB_TO_JA_MOUTH[source]; + if (mouth) { + return mouth; + } + + warnings.push(`Unknown Rhubarb mouth shape "${source}", using "rest".`); + return "rest"; +}; + +const assertFiniteNumber = (value, fieldName, index) => { + if (!Number.isFinite(value)) { + throw new Error(`mouthCues[${index}].${fieldName} must be a number.`); + } +}; + +export const normalizeRhubarbJson = ( + rhubarbJson, + {audio, recognizer = "phonetic"} = {} +) => { + if (!rhubarbJson || !Array.isArray(rhubarbJson.mouthCues)) { + throw new Error("Rhubarb JSON must contain a mouthCues array."); + } + + const warnings = []; + const cues = rhubarbJson.mouthCues.map((cue, index) => { + const start = Number(cue?.start); + const end = Number(cue?.end); + assertFiniteNumber(start, "start", index); + assertFiniteNumber(end, "end", index); + + const source = typeof cue?.value === "string" ? cue.value : String(cue?.value); + + return { + start, + end, + mouth: mapRhubarbMouthToJapanese(source, warnings), + source, + }; + }); + + const metadataDuration = Number(rhubarbJson.metadata?.duration); + const duration = Number.isFinite(metadataDuration) + ? metadataDuration + : cues.at(-1)?.end ?? 0; + + return { + timeline: { + version: 1, + source: { + audio: audio ?? rhubarbJson.metadata?.soundFile ?? "", + engine: "rhubarb-lip-sync", + recognizer, + }, + duration, + cues, + }, + warnings, + }; +}; diff --git a/voicevox-remotion-template/scripts/lipsync-utils.test.js b/voicevox-remotion-template/scripts/lipsync-utils.test.js new file mode 100644 index 0000000..133bb5d --- /dev/null +++ b/voicevox-remotion-template/scripts/lipsync-utils.test.js @@ -0,0 +1,54 @@ +import assert from "node:assert/strict"; +import {test} from "node:test"; +import {normalizeRhubarbJson} from "./lipsync-utils.js"; + +test("maps Rhubarb mouth shapes to Japanese mouth shapes", () => { + const {timeline} = normalizeRhubarbJson( + { + metadata: {duration: 1.2}, + mouthCues: [ + {start: 0, end: 0.1, value: "X"}, + {start: 0.1, end: 0.2, value: "D"}, + {start: 0.2, end: 0.3, value: "F"}, + ], + }, + {audio: "audio/example.wav"} + ); + + assert.deepEqual( + timeline.cues.map((cue) => cue.mouth), + ["rest", "a", "u"] + ); +}); + +test("uses rest for unknown shapes and reports a warning", () => { + const {timeline, warnings} = normalizeRhubarbJson( + { + mouthCues: [{start: 0, end: 0.1, value: "Z"}], + }, + {audio: "audio/example.wav"} + ); + + assert.equal(timeline.cues[0].mouth, "rest"); + assert.match(warnings[0], /Unknown Rhubarb mouth shape "Z"/); +}); + +test("uses metadata duration when available", () => { + const {timeline} = normalizeRhubarbJson({ + metadata: {duration: 2.5}, + mouthCues: [{start: 0, end: 0.1, value: "X"}], + }); + + assert.equal(timeline.duration, 2.5); +}); + +test("falls back to the last cue end for duration", () => { + const {timeline} = normalizeRhubarbJson({ + mouthCues: [ + {start: 0, end: 0.4, value: "X"}, + {start: 0.4, end: 0.8, value: "D"}, + ], + }); + + assert.equal(timeline.duration, 0.8); +}); diff --git a/voicevox-remotion-template/src/avatar-animations.ts b/voicevox-remotion-template/src/avatar-animations.ts index 3fcadf5..22a42f1 100644 --- a/voicevox-remotion-template/src/avatar-animations.ts +++ b/voicevox-remotion-template/src/avatar-animations.ts @@ -19,8 +19,16 @@ export type IdleAvatarAnimationType = keyof typeof idleAvatarAnimations; -export const speakingAvatarAnimations = { +type SpeakingAvatarAnimationMap = Readonly<{ + none: AvatarAnimation; + rhubarbLipSync: AvatarAnimation; + gentleBob: AvatarAnimation; + quickHop: AvatarAnimation; +}>; + +export const speakingAvatarAnimations: SpeakingAvatarAnimationMap = { none: () => 0, + rhubarbLipSync: () => 0, gentleBob: idleAvatarAnimations.gentleBob, quickHop: ({frame, fps}) => { const cycleFrames = Math.max(1, Math.round(fps * 0.25)); @@ -28,6 +36,6 @@ return -Math.sin(progress * Math.PI) * 7; }, -} satisfies Record; +}; export type SpeakingAvatarAnimationType = keyof typeof speakingAvatarAnimations; diff --git a/voicevox-remotion-template/src/data/pizza-kiln/script.ts b/voicevox-remotion-template/src/data/pizza-kiln/script.ts index 158a5e3..5670dc7 100644 --- a/voicevox-remotion-template/src/data/pizza-kiln/script.ts +++ b/voicevox-remotion-template/src/data/pizza-kiln/script.ts @@ -1,3 +1,5 @@ +import type {SpeakingAvatarAnimationType} from "../../avatar-animations"; + export type VoicevoxVoice = Readonly<{ speakerName: string; styleName: string; @@ -7,8 +9,11 @@ displayName: string; voicevox: VoicevoxVoice; avatar: Readonly<{ + kind: "sayo"; accentColor: string; imagePath: string; + mouthImageDir?: string; + speakingAnimationType?: SpeakingAvatarAnimationType; }>; }>; @@ -20,8 +25,10 @@ styleName: "γƒŽγƒΌγƒžγƒ«", }, avatar: { + kind: "sayo", accentColor: "#6b5f83", imagePath: "image/sayo-standee-base.png", + speakingAnimationType: "rhubarbLipSync", }, }, } as const satisfies Record; diff --git a/voicevox-remotion-template/src/data/script.ts b/voicevox-remotion-template/src/data/script.ts index fdc83fd..a745d5f 100644 --- a/voicevox-remotion-template/src/data/script.ts +++ b/voicevox-remotion-template/src/data/script.ts @@ -12,6 +12,7 @@ kind: "zundamon" | "sayo"; accentColor: string; imagePath?: string; + mouthImageDir?: string; imageLayout?: Readonly<{ width?: number; maxHeight?: number; @@ -48,7 +49,7 @@ }, nameplatePosition: "none", idleAnimationType: "none", - speakingAnimationType: "quickHop", + speakingAnimationType: "rhubarbLipSync", }, }, sayo: { @@ -68,7 +69,7 @@ }, nameplatePosition: "none", idleAnimationType: "none", - speakingAnimationType: "quickHop", + speakingAnimationType: "rhubarbLipSync", }, }, } as const satisfies Record; diff --git a/voicevox-remotion-template/src/lipsync/LipSyncedStandeeImage.tsx b/voicevox-remotion-template/src/lipsync/LipSyncedStandeeImage.tsx new file mode 100644 index 0000000..81f2271 --- /dev/null +++ b/voicevox-remotion-template/src/lipsync/LipSyncedStandeeImage.tsx @@ -0,0 +1,64 @@ +import React from "react"; +import {Img, staticFile} from "remotion"; +import type {JapaneseMouthShape} from "./types"; + +type LipSyncedStandeeImageProps = Readonly<{ + imagePath: string; + mouthImageDir: string; + mouth: JapaneseMouthShape; + width: number | string; + maxHeight: number | string; + height?: number | string; + transform?: string; + filter?: string; +}>; + +export const defaultMouthImageDir = (avatarKind: string) => + `image/${avatarKind}-rhubarb-mouths`; + +export const LipSyncedStandeeImage: React.FC = ({ + imagePath, + mouthImageDir, + mouth, + width, + maxHeight, + height, + transform, + filter, +}) => { + return ( +
+ + +
+ ); +}; diff --git a/voicevox-remotion-template/src/lipsync/getMouthAtTime.ts b/voicevox-remotion-template/src/lipsync/getMouthAtTime.ts new file mode 100644 index 0000000..a818ac5 --- /dev/null +++ b/voicevox-remotion-template/src/lipsync/getMouthAtTime.ts @@ -0,0 +1,8 @@ +import type {JapaneseMouthShape, MouthTimeline} from "./types"; + +export const getMouthAtTime = ( + timeline: MouthTimeline | undefined, + seconds: number +): JapaneseMouthShape => + timeline?.cues.find((cue) => seconds >= cue.start && seconds < cue.end) + ?.mouth ?? "rest"; diff --git a/voicevox-remotion-template/src/lipsync/manifest.ts b/voicevox-remotion-template/src/lipsync/manifest.ts new file mode 100644 index 0000000..f0a4a7b --- /dev/null +++ b/voicevox-remotion-template/src/lipsync/manifest.ts @@ -0,0 +1,21 @@ +import generatedManifest from "../generated/lipsync/manifest.json"; +import {getMouthAtTime} from "./getMouthAtTime"; +import type { + JapaneseMouthShape, + MouthTimeline, + MouthTimelineManifest, +} from "./types"; + +const lipsyncManifest = generatedManifest as MouthTimelineManifest; + +export const mouthTimelineForSpeech = ( + speechId: string | undefined +): MouthTimeline | undefined => + speechId ? lipsyncManifest.timelines[speechId] : undefined; + +export const getMouthForSpeechFrame = ( + speechId: string | undefined, + frame: number, + fps: number +): JapaneseMouthShape => + getMouthAtTime(mouthTimelineForSpeech(speechId), frame / fps); diff --git a/voicevox-remotion-template/src/lipsync/rhubarb-map.ts b/voicevox-remotion-template/src/lipsync/rhubarb-map.ts new file mode 100644 index 0000000..dbce856 --- /dev/null +++ b/voicevox-remotion-template/src/lipsync/rhubarb-map.ts @@ -0,0 +1,22 @@ +import type {JapaneseMouthShape} from "./types"; + +export const RHUBARB_TO_JA_MOUTH = { + X: "rest", + A: "closed", + B: "i", + C: "e", + D: "a", + E: "o", + F: "u", + G: "i", + H: "e", +} as const satisfies Record; + +export const rhubarbMouthToJapanese = ( + source: string +): JapaneseMouthShape => { + const mouthMap: Readonly> = + RHUBARB_TO_JA_MOUTH; + + return mouthMap[source] ?? "rest"; +}; diff --git a/voicevox-remotion-template/src/lipsync/types.ts b/voicevox-remotion-template/src/lipsync/types.ts new file mode 100644 index 0000000..980ef73 --- /dev/null +++ b/voicevox-remotion-template/src/lipsync/types.ts @@ -0,0 +1,31 @@ +export type JapaneseMouthShape = + | "a" + | "i" + | "u" + | "e" + | "o" + | "closed" + | "rest"; + +export type MouthCue = Readonly<{ + start: number; + end: number; + mouth: JapaneseMouthShape; + source: string; +}>; + +export type MouthTimeline = Readonly<{ + version: 1; + source: Readonly<{ + audio: string; + engine: "rhubarb-lip-sync"; + recognizer: "phonetic"; + }>; + duration: number; + cues: MouthCue[]; +}>; + +export type MouthTimelineManifest = Readonly<{ + version: 1; + timelines: Record; +}>; diff --git a/voicevox-remotion-template/src/pizza-kiln-composition.tsx b/voicevox-remotion-template/src/pizza-kiln-composition.tsx index 5c1f28c..722e762 100644 --- a/voicevox-remotion-template/src/pizza-kiln-composition.tsx +++ b/voicevox-remotion-template/src/pizza-kiln-composition.tsx @@ -2,7 +2,6 @@ import {Audio, Video} from "@remotion/media"; import { AbsoluteFill, - Img, interpolate, Sequence, spring, @@ -17,8 +16,19 @@ PIZZA_KILN_GAP_FRAMES, PIZZA_KILN_VIDEO_FRAMES, } from "./data/pizza-kiln/timing"; -import {characters, timeline, type SpeechEvent} from "./data/pizza-kiln/script"; +import { + characters, + timeline, + type CharacterDefinition, + type SpeechEvent, +} from "./data/pizza-kiln/script"; import {roundedFontFamily} from "./fonts"; +import {speakingAvatarAnimations} from "./avatar-animations"; +import {getMouthForSpeechFrame} from "./lipsync/manifest"; +import { + defaultMouthImageDir, + LipSyncedStandeeImage, +} from "./lipsync/LipSyncedStandeeImage"; const BACKGROUND_VIDEO_PATH = "video/pizza-kiln-background.mp4"; @@ -98,12 +108,23 @@ speaking: boolean; localFrame: number; fps: number; + speechId?: string; }> -> = ({mode, speaking, localFrame, fps}) => { - const cycleFrames = Math.max(1, Math.round(fps * 0.25)); - const progress = (localFrame % cycleFrames) / cycleFrames; - const speakingHop = -Math.sin(progress * Math.PI) * 7; - const translateY = speaking ? speakingHop : 0; +> = ({mode, speaking, localFrame, fps, speechId}) => { + const {avatar}: {avatar: CharacterDefinition["avatar"]} = characters.sayo; + const speakingAnimationType = avatar.speakingAnimationType ?? "none"; + const translateY = speaking + ? speakingAvatarAnimations[speakingAnimationType]({ + frame: localFrame, + fps, + focused: true, + hasMultipleCharacters: false, + }) + : 0; + const mouth = + speaking && speakingAnimationType === "rhubarbLipSync" + ? getMouthForSpeechFrame(speechId, localFrame, fps) + : "rest"; const isCorner = mode === "corner"; return ( @@ -121,16 +142,20 @@ zIndex: 3, }} > - ); @@ -185,8 +210,9 @@ const isVideoVisible = frame >= videoFrom && frame < videoFrom + PIZZA_KILN_VIDEO_FRAMES; const isOutro = frame >= outroFrom; - const isSpeaking = frame < introFrames || isOutro; - const speechLocalFrame = isOutro ? frame - outroFrom : frame; + const activeSpeech = + frame < introFrames ? introSpeech : isOutro ? outroSpeech : undefined; + const speechLocalFrame = activeSpeech === outroSpeech ? frame - outroFrom : frame; return ( @@ -208,9 +234,10 @@ diff --git a/voicevox-remotion-template/src/yukkuri-composition.tsx b/voicevox-remotion-template/src/yukkuri-composition.tsx index 47b2ea7..8a1b467 100644 --- a/voicevox-remotion-template/src/yukkuri-composition.tsx +++ b/voicevox-remotion-template/src/yukkuri-composition.tsx @@ -2,7 +2,6 @@ import {Audio} from "@remotion/media"; import { AbsoluteFill, - Img, interpolate, Sequence, spring, @@ -29,6 +28,11 @@ hasAudioForSpeech, } from "./data/timing"; import {roundedFontFamily} from "./fonts"; +import {getMouthForSpeechFrame} from "./lipsync/manifest"; +import { + defaultMouthImageDir, + LipSyncedStandeeImage, +} from "./lipsync/LipSyncedStandeeImage"; type ScheduledTimelineEvent = Readonly<{ event: TimelineEvent; @@ -85,6 +89,23 @@ extrapolateRight: "clamp", } as const; +const mouthForSpeechFrame = ({ + isSpeaking, + speakingAnimationType, + speechId, + speakingLocalFrame, + fps, +}: Readonly<{ + isSpeaking: boolean; + speakingAnimationType: string; + speechId?: string; + speakingLocalFrame: number; + fps: number; +}>) => + isSpeaking && speakingAnimationType === "rhubarbLipSync" + ? getMouthForSpeechFrame(speechId, speakingLocalFrame, fps) + : "rest"; + const Title: React.FC> = ({progress}) => { const opacity = interpolate(progress, [0, 1], [0, 1], clampInterpolation); const translateY = interpolate(progress, [0, 1], [-30, 0], clampInterpolation); @@ -388,8 +409,19 @@ frame: number; fps: number; isSpeaking: boolean; + speakingSpeechId?: string; + speakingLocalFrame: number; }> -> = ({characterId, focused, hasMultipleCharacters, frame, fps, isSpeaking}) => { +> = ({ + characterId, + focused, + hasMultipleCharacters, + frame, + fps, + isSpeaking, + speakingSpeechId, + speakingLocalFrame, +}) => { const character = characters[characterId]; const {avatar}: {avatar: AvatarDefinition} = character; const scale = hasMultipleCharacters ? 0.88 : focused ? 1.05 : 1; @@ -414,6 +446,13 @@ const imageTransform = `translateY(${imageTranslateY}px) scaleX(${imageScaleX})`; const nameplatePosition = avatar.nameplatePosition ?? "bottom"; const showNameplate = nameplatePosition !== "none"; + const mouth = mouthForSpeechFrame({ + isSpeaking, + speakingAnimationType, + speechId: speakingSpeechId, + speakingLocalFrame, + fps, + }); const nameplate = (
{showNameplate && nameplatePosition === "top" ? nameplate : null} {avatar.imagePath ? ( - ) : avatar.kind === "sayo" ? (
-> = ({visibleCharacters, focusedCharacter, speakingCharacter, frame, fps}) => { +> = ({ + visibleCharacters, + focusedCharacter, + speakingCharacter, + speakingSpeechId, + speakingLocalFrame, + frame, + fps, +}) => { const hasMultipleCharacters = visibleCharacters.length > 1; return ( @@ -510,6 +556,8 @@ frame={frame} fps={fps} isSpeaking={speakingCharacter === characterId} + speakingSpeechId={speakingSpeechId} + speakingLocalFrame={speakingLocalFrame} /> ))}
@@ -583,10 +631,12 @@ fps, config: {damping: 18, mass: 0.6}, }); - const speakingCharacter = + const activeSpeech = isInsideActiveSegment && activeSegment.event.type === "say" - ? activeSegment.event.character + ? activeSegment.event : undefined; + const speakingCharacter = activeSpeech?.character; + const speakingLocalFrame = activeSpeech ? frame - activeSegment.from : 0; const sequences = scheduledEvents.map((scheduledEvent, index) => (