diff --git a/voicevox-remotion-template/README.md b/voicevox-remotion-template/README.md index 5d97048..be067d0 100644 --- a/voicevox-remotion-template/README.md +++ b/voicevox-remotion-template/README.md @@ -82,9 +82,11 @@ 対象一覧と使い方を表示します。このコマンドだけでは口パクデータを生成しません。 -生成物は、Rhubarb の生 JSON が `public/lipsync/raw/*.rhubarb.json`、 +生成物は、Rhubarb 方式では生 JSON が `public/lipsync/raw/*.rhubarb.json`、 Remotion 用に正規化した JSON が `src/generated/lipsync/*.mouth.json`、 プレビュー時に同期 import する集約 manifest が `src/generated/lipsync/manifest.json` です。 +VOICEVOX query 方式のコンポジションでは、VOICEVOX の `audio_query` に含まれるモーラ長から +直接 `src/generated/lipsync/*.mouth.json` を生成します。 全コンポジションの口パクデータをまとめて生成する場合は、次を実行します。 @@ -128,7 +130,7 @@ Dev Container で使う場合は Linux 版 Rhubarb を配置し、必要なら `RHUBARB_BIN=/usr/local/bin/rhubarb` のように指定してください。 -日本語音声では Rhubarb の `phonetic` recognizer を使います。音声のみからの推定なので、 +既定では Rhubarb の `phonetic` recognizer を使います。音声のみからの推定なので、 日本語の母音完全一致ではなく、動画用に自然に見える口パクを目的にしています。 Rhubarb 口形は次のように丸めます。 @@ -146,6 +148,11 @@ } ``` +`zundamon-rework-standee-demo` は、Rhubarb ではなく VOICEVOX query 方式を使います。 +この方式では VOICEVOX の `audio_query` に含まれる `a` / `i` / `u` / `e` / `o` などの +母音とモーラ長から口形タイムラインを作るため、日本語母音の確認デモに向いています。 +既存コンポジションは従来どおり Rhubarb 方式です。 + ### 6. プレビュー ```bash npm run start @@ -316,7 +323,7 @@ の口形タイムラインに合わせて `mouthImageDir` の画像を切り替えます。 新しい時系列コンポジションでは、`VQChronologicalScenario` の `assetWorkflow` に -VOICEVOX 音声生成と Rhubarb 口パク生成のパスをセットで定義します。口形はコンポジションに +VOICEVOX 音声生成と口パク生成のパスをセットで定義します。口形はコンポジションに 固定値を書かず、`speech.id` をキーに `src/generated/lipsync/manifest.json` から参照します。 ### 4. コンポジション固有の見栄えを調整 diff --git a/voicevox-remotion-template/scripts/generate-lipsync.js b/voicevox-remotion-template/scripts/generate-lipsync.js index 925702c..8a78509 100644 --- a/voicevox-remotion-template/scripts/generate-lipsync.js +++ b/voicevox-remotion-template/scripts/generate-lipsync.js @@ -2,7 +2,7 @@ import path from "node:path"; import {spawn} from "node:child_process"; import {fileURLToPath} from "node:url"; -import {normalizeRhubarbJson} from "./lipsync-utils.js"; +import {normalizeRhubarbJson, normalizeVoicevoxTiming} from "./lipsync-utils.js"; const projectRoot = path.resolve( fileURLToPath(new URL("..", import.meta.url)) @@ -27,6 +27,7 @@ { name: "zundamon-rework-standee-demo", sourceManifest: "src/data/zundamon-rework-standee-demo/voicevox-manifest.json", + engine: "voicevox-query", }, ]; @@ -198,9 +199,10 @@ outPath: undefined, rawOutPath: undefined, manifestPath: resolveProjectPath(options.manifest), - sourceManifestPaths: lipsyncTargets.map(({sourceManifest}) => - resolveProjectPath(sourceManifest) - ), + sourceManifests: lipsyncTargets.map(({sourceManifest, engine}) => ({ + path: resolveProjectPath(sourceManifest), + engine: engine ?? "rhubarb", + })), shouldMergeExistingManifest: false, }; }; @@ -234,7 +236,12 @@ outPath: undefined, rawOutPath: undefined, manifestPath: resolveProjectPath(options.manifest), - sourceManifestPaths: [resolveProjectPath(target.sourceManifest)], + sourceManifests: [ + { + path: resolveProjectPath(target.sourceManifest), + engine: target.engine ?? "rhubarb", + }, + ], shouldMergeExistingManifest: true, }; }; @@ -262,7 +269,10 @@ ? resolveProjectPath(options.rawOut) : undefined, manifestPath: resolveProjectPath(options.manifest), - sourceManifestPaths: options.sourceManifests.map(resolveProjectPath), + sourceManifests: options.sourceManifests.map((sourceManifest) => ({ + path: resolveProjectPath(sourceManifest), + engine: "rhubarb", + })), shouldMergeExistingManifest: true, }; }; @@ -421,6 +431,7 @@ return { id, + engine: "rhubarb", inputPath, sourceAudio: toPublicRelative(inputPath), rawOutputPath: rawOutPath ?? path.join(rawDir, `${id}.rhubarb.json`), @@ -430,8 +441,8 @@ // 用途: VOICEVOX manifest の各音声エントリから、口パク生成タスク群を作る。 // 使用方法: --all、--project、または --source-manifest 指定時の処理で await して呼び出す。 -// オプションや引数詳細: manifestPath は id と file を含む JSON 配列を想定し、file は public 配下から解決する。 -const tasksForVoicevoxManifest = async (manifestPath) => { +// オプションや引数詳細: manifestPath は id と file を含む JSON 配列、engine は rhubarb または voicevox-query を指定する。 +const tasksForVoicevoxManifest = async ({manifestPath, engine = "rhubarb"}) => { if (!(await pathExists(manifestPath))) { return []; } @@ -456,8 +467,11 @@ return { id: entry.id, + engine, inputPath, sourceAudio: entry.file, + durationSeconds: entry.durationSeconds, + voicevoxTiming: entry.voicevoxTiming, rawOutputPath: path.join(rawDir, `${entry.id}.rhubarb.json`), outputPath: path.join(generatedDir, `${entry.id}.mouth.json`), }; @@ -467,10 +481,15 @@ // 用途: 指定された VOICEVOX manifest から、まとめて生成する口パクタスクを集める。 // 使用方法: --all、--project、または --source-manifest 指定時に await して呼び出す。 -// オプションや引数詳細: sourceManifestPaths は1件以上を想定し、読み取れる音声エントリが1件もなければエラーにする。 -const tasksForVoicevoxManifests = async (sourceManifestPaths) => { +// オプションや引数詳細: sourceManifests は path と engine を持つ設定の配列で、読み取れる音声エントリが1件もなければエラーにする。 +const tasksForVoicevoxManifests = async (sourceManifests) => { const taskGroups = await Promise.all( - sourceManifestPaths.map((manifest) => tasksForVoicevoxManifest(manifest)) + sourceManifests.map((manifest) => + tasksForVoicevoxManifest({ + manifestPath: manifest.path, + engine: manifest.engine, + }) + ) ); const tasks = taskGroups.flat(); if (tasks.length === 0) { @@ -489,9 +508,9 @@ }; // 用途: 1件の音声タスクについて Rhubarb 実行、正規化、JSON 保存までを行う。 -// 使用方法: tasks の各要素に対して await し、manifest に格納する timeline を受け取る。 +// 使用方法: task.engine が rhubarb の場合に generateTask から呼び出す。 // オプションや引数詳細: rhubarbBin は Rhubarb CLI、task は id・inputPath・sourceAudio・rawOutputPath・outputPath を含む。 -const generateTask = async (rhubarbBin, task) => { +const generateRhubarbTask = async (rhubarbBin, task) => { await fs.mkdir(path.dirname(task.rawOutputPath), {recursive: true}); await runRhubarb(rhubarbBin, task.inputPath, task.rawOutputPath); @@ -514,6 +533,40 @@ return timeline; }; +// 用途: VOICEVOX audio_query のモーラ長から、音声解析を介さず口形 JSON を生成する。 +// 使用方法: task.engine が voicevox-query の場合に generateTask から呼び出す。 +// オプションや引数詳細: task.voicevoxTiming は voicevox-generate が manifest に保存した軽量タイミング情報を想定する。 +const generateVoicevoxQueryTask = async (task) => { + if (!task.voicevoxTiming) { + throw new Error( + `${task.id} needs voicevoxTiming. Run npm run voice:generate for the project before lipsync generation.` + ); + } + + const timeline = normalizeVoicevoxTiming(task.voicevoxTiming, { + audio: task.sourceAudio, + durationSeconds: task.durationSeconds, + }); + await writeJson(task.outputPath, timeline); + + console.log( + `Wrote ${toProjectRelative(task.outputPath)} from VOICEVOX timing in ${task.id}` + ); + + return timeline; +}; + +// 用途: 1件の口パク生成タスクを engine に応じた方式へ振り分ける。 +// 使用方法: tasks の各要素に対して await し、manifest に格納する timeline を受け取る。 +// オプションや引数詳細: voicevox-query は VOICEVOX manifest 由来の task のみ対応し、任意 wav は rhubarb を使う。 +const generateTask = async (rhubarbBin, task) => { + if (task.engine === "voicevox-query") { + return generateVoicevoxQueryTask(task); + } + + return generateRhubarbTask(rhubarbBin, task); +}; + const parsedArgs = parseArgs(); if (parsedArgs.mode === "help") { printHelp(); @@ -523,13 +576,14 @@ outPath, rawOutPath, manifestPath, - sourceManifestPaths, + sourceManifests, shouldMergeExistingManifest, } = parsedArgs; - const rhubarbBin = await findRhubarbBin(); const tasks = audioPath ? [await taskForAudioPath({audioPath, outPath, rawOutPath})] - : await tasksForVoicevoxManifests(sourceManifestPaths); + : await tasksForVoicevoxManifests(sourceManifests); + const needsRhubarb = tasks.some((task) => task.engine !== "voicevox-query"); + const rhubarbBin = needsRhubarb ? await findRhubarbBin() : undefined; const generatedManifest = shouldMergeExistingManifest ? await loadExistingGeneratedManifest(manifestPath) : {version: 1, timelines: {}}; diff --git a/voicevox-remotion-template/scripts/lipsync-utils.js b/voicevox-remotion-template/scripts/lipsync-utils.js index af61e4f..e83ae6a 100644 --- a/voicevox-remotion-template/scripts/lipsync-utils.js +++ b/voicevox-remotion-template/scripts/lipsync-utils.js @@ -30,6 +30,17 @@ return "rest"; }; +const VOICEVOX_VOWEL_TO_JA_MOUTH = Object.freeze({ + a: "a", + i: "i", + u: "u", + e: "e", + o: "o", + N: "closed", +}); + +const CLOSED_CONSONANTS = new Set(["p", "b", "m"]); + const assertFiniteNumber = (value, fieldName, index) => { if (!Number.isFinite(value)) { throw new Error(`mouthCues[${index}].${fieldName} must be a number.`); @@ -80,3 +91,135 @@ warnings, }; }; + +const positiveDuration = (value) => + typeof value === "number" && Number.isFinite(value) && value > 0 ? value : 0; + +const voicevoxMoraToParts = (mora) => { + const parts = []; + const consonantLength = positiveDuration(mora?.consonantLength); + if (consonantLength > 0) { + const consonant = mora?.consonant; + parts.push({ + duration: consonantLength, + mouth: CLOSED_CONSONANTS.has(consonant) ? "closed" : "rest", + source: consonant ? `consonant:${consonant}` : "consonant", + }); + } + + const vowelLength = positiveDuration(mora?.vowelLength); + if (vowelLength > 0) { + const vowel = mora?.vowel; + parts.push({ + duration: vowelLength, + mouth: VOICEVOX_VOWEL_TO_JA_MOUTH[vowel] ?? "rest", + source: vowel ? `vowel:${vowel}` : "vowel", + }); + } + + return parts; +}; + +const appendCuePart = (parts, part) => { + if (part.duration <= 0) { + return; + } + + parts.push(part); +}; + +const voicevoxTimingToParts = (voicevoxTiming) => { + const parts = []; + appendCuePart(parts, { + duration: positiveDuration(voicevoxTiming?.prePhonemeLength), + mouth: "rest", + source: "prePhoneme", + }); + + for (const phrase of voicevoxTiming?.accentPhrases ?? []) { + for (const mora of phrase?.moras ?? []) { + for (const part of voicevoxMoraToParts(mora)) { + appendCuePart(parts, part); + } + } + + if (phrase?.pauseMora) { + appendCuePart(parts, { + duration: positiveDuration(phrase.pauseMora.vowelLength), + mouth: "rest", + source: "pause", + }); + } + } + + appendCuePart(parts, { + duration: positiveDuration(voicevoxTiming?.postPhonemeLength), + mouth: "rest", + source: "postPhoneme", + }); + + return parts; +}; + +const mergeCue = (cues, cue) => { + const previous = cues.at(-1); + if (previous && previous.mouth === cue.mouth) { + previous.end = cue.end; + previous.source = + previous.source === cue.source + ? previous.source + : `${previous.source}+${cue.source}`; + return; + } + + cues.push(cue); +}; + +export const normalizeVoicevoxTiming = ( + voicevoxTiming, + {audio = "", durationSeconds} = {} +) => { + if (!voicevoxTiming || !Array.isArray(voicevoxTiming.accentPhrases)) { + throw new Error("VOICEVOX timing must contain an accentPhrases array."); + } + + const parts = voicevoxTimingToParts(voicevoxTiming); + const computedDuration = parts.reduce((sum, part) => sum + part.duration, 0); + const targetDuration = + typeof durationSeconds === "number" && Number.isFinite(durationSeconds) + ? durationSeconds + : computedDuration; + const scale = + computedDuration > 0 && targetDuration > 0 + ? targetDuration / computedDuration + : 1; + const cues = []; + let cursor = 0; + + for (const part of parts) { + const duration = part.duration * scale; + if (duration <= 0) { + continue; + } + + const start = cursor; + const end = cursor + duration; + mergeCue(cues, { + start, + end, + mouth: part.mouth, + source: part.source, + }); + cursor = end; + } + + return { + version: 1, + source: { + audio, + engine: "voicevox-query", + }, + duration: targetDuration, + cues, + }; +}; diff --git a/voicevox-remotion-template/scripts/lipsync-utils.test.js b/voicevox-remotion-template/scripts/lipsync-utils.test.js index 133bb5d..e94981d 100644 --- a/voicevox-remotion-template/scripts/lipsync-utils.test.js +++ b/voicevox-remotion-template/scripts/lipsync-utils.test.js @@ -1,6 +1,6 @@ import assert from "node:assert/strict"; import {test} from "node:test"; -import {normalizeRhubarbJson} from "./lipsync-utils.js"; +import {normalizeRhubarbJson, normalizeVoicevoxTiming} from "./lipsync-utils.js"; test("maps Rhubarb mouth shapes to Japanese mouth shapes", () => { const {timeline} = normalizeRhubarbJson( @@ -52,3 +52,76 @@ assert.equal(timeline.duration, 0.8); }); + +test("builds mouth cues from VOICEVOX vowels", () => { + const timeline = normalizeVoicevoxTiming( + { + prePhonemeLength: 0.1, + postPhonemeLength: 0.1, + accentPhrases: [ + { + moras: [ + {vowel: "a", vowelLength: 0.2}, + {vowel: "i", vowelLength: 0.2}, + {vowel: "u", vowelLength: 0.2}, + {vowel: "e", vowelLength: 0.2}, + {vowel: "o", vowelLength: 0.2}, + {vowel: "N", vowelLength: 0.2}, + ], + }, + ], + }, + {audio: "audio/example.wav"} + ); + + assert.deepEqual( + timeline.cues.map((cue) => cue.mouth), + ["rest", "a", "i", "u", "e", "o", "closed", "rest"] + ); + assert.equal(timeline.source.engine, "voicevox-query"); +}); + +test("maps selected VOICEVOX consonants to closed and others to rest", () => { + const timeline = normalizeVoicevoxTiming({ + accentPhrases: [ + { + moras: [ + {consonant: "p", consonantLength: 0.1, vowel: "a", vowelLength: 0.1}, + {consonant: "m", consonantLength: 0.1, vowel: "i", vowelLength: 0.1}, + {consonant: "k", consonantLength: 0.1, vowel: "u", vowelLength: 0.1}, + ], + }, + ], + }); + + assert.deepEqual( + timeline.cues.map((cue) => cue.mouth), + ["closed", "a", "closed", "i", "rest", "u"] + ); +}); + +test("scales VOICEVOX timing duration and merges adjacent mouths", () => { + const timeline = normalizeVoicevoxTiming( + { + prePhonemeLength: 0.1, + postPhonemeLength: 0.1, + accentPhrases: [ + { + moras: [ + {consonant: "k", consonantLength: 0.1, vowel: "x", vowelLength: 0.1}, + ], + pauseMora: {vowel: "pau", vowelLength: 0.1}, + }, + ], + }, + {durationSeconds: 1} + ); + + assert.equal(timeline.duration, 1); + assert.deepEqual( + timeline.cues.map((cue) => cue.mouth), + ["rest"] + ); + assert.equal(timeline.cues[0].start, 0); + assert.equal(timeline.cues[0].end, 1); +}); diff --git a/voicevox-remotion-template/scripts/voicevox-generate.js b/voicevox-remotion-template/scripts/voicevox-generate.js index dc39d11..f6cefc8 100644 --- a/voicevox-remotion-template/scripts/voicevox-generate.js +++ b/voicevox-remotion-template/scripts/voicevox-generate.js @@ -37,6 +37,7 @@ script: "src/data/zundamon-rework-standee-demo/script.ts", output: "public/audio/zundamon-rework-standee-demo/lines", manifest: "src/data/zundamon-rework-standee-demo/voicevox-manifest.json", + lipsyncEngine: "voicevox-query", }, ]; @@ -58,6 +59,7 @@ scriptPath: resolveProjectPath(target.script), outputDir: resolveProjectPath(target.output), manifestPath: resolveProjectPath(target.manifest), + lipsyncEngine: target.lipsyncEngine ?? "rhubarb", }); // 用途: npm run voice:generate のヘルプを表示し、明示的な生成コマンドを案内する。 @@ -354,6 +356,35 @@ return style.id; }; +// 用途: VOICEVOX audio_query から口形生成に必要なタイミング情報だけを取り出す。 +// 使用方法: voicevox-query lip sync を使う対象の manifest entry に保存する。 +// オプションや引数詳細: query は VOICEVOX の audio_query レスポンスで、mora と pause の長さだけを保持する。 +const extractVoicevoxTiming = (query) => ({ + prePhonemeLength: query.prePhonemeLength, + postPhonemeLength: query.postPhonemeLength, + speedScale: query.speedScale, + accentPhrases: query.accent_phrases.map((phrase) => ({ + moras: phrase.moras.map((mora) => ({ + text: mora.text, + consonant: mora.consonant, + consonantLength: mora.consonant_length, + vowel: mora.vowel, + vowelLength: mora.vowel_length, + })), + ...(phrase.pause_mora + ? { + pauseMora: { + text: phrase.pause_mora.text, + consonant: phrase.pause_mora.consonant, + consonantLength: phrase.pause_mora.consonant_length, + vowel: phrase.pause_mora.vowel, + vowelLength: phrase.pause_mora.vowel_length, + }, + } + : {}), + })), +}); + // 用途: 1つのコンポジション設定に対して VOICEVOX 音声と manifest を生成する。 // 使用方法: CLI で解決した targets を順に渡し、共有済み speakers を使って await する。 // オプションや引数詳細: target は name・scriptPath・outputDir・manifestPath、speakers は /speakers のレスポンスを渡す。 @@ -417,7 +448,7 @@ const outputPath = path.join(outputDir, `${speech.id}.wav`); await fs.writeFile(outputPath, audioBuffer); const durationSeconds = getWavDurationSeconds(audioBuffer); - manifest.push({ + const manifestEntry = { id: speech.id, character: speech.character, speakerName: voice.speakerName, @@ -425,7 +456,13 @@ speakerId, file: `${publicRelativeOutputDir}/${speech.id}.wav`, durationSeconds, - }); + }; + + if (target.lipsyncEngine === "voicevox-query") { + manifestEntry.voicevoxTiming = extractVoicevoxTiming(query); + } + + manifest.push(manifestEntry); console.log( `Wrote ${outputPath} (${voice.speakerName} / ${voice.styleName}, ${durationSeconds.toFixed(2)}s)` ); diff --git a/voicevox-remotion-template/src/data/zundamon-rework-standee-demo/voicevox-manifest.json b/voicevox-remotion-template/src/data/zundamon-rework-standee-demo/voicevox-manifest.json index 80f28bb..653e07a 100644 --- a/voicevox-remotion-template/src/data/zundamon-rework-standee-demo/voicevox-manifest.json +++ b/voicevox-remotion-template/src/data/zundamon-rework-standee-demo/voicevox-manifest.json @@ -6,6 +6,272 @@ "styleName": "ノーマル", "speakerId": 3, "file": "audio/zundamon-rework-standee-demo/lines/zundamon-rework-standee-demo-zunda-001.wav", - "durationSeconds": 16.373333333333335 + "durationSeconds": 16.373333333333335, + "voicevoxTiming": { + "prePhonemeLength": 0.1, + "postPhonemeLength": 0.1, + "speedScale": 1.02, + "accentPhrases": [ + { + "moras": [ + { + "text": "ア", + "consonant": null, + "consonantLength": null, + "vowel": "a", + "vowelLength": 0.46016019582748413 + } + ], + "pauseMora": { + "text": "、", + "consonant": null, + "consonantLength": null, + "vowel": "pau", + "vowelLength": 1.19928777217865 + } + }, + { + "moras": [ + { + "text": "イ", + "consonant": null, + "consonantLength": null, + "vowel": "i", + "vowelLength": 0.4315135180950165 + } + ], + "pauseMora": { + "text": "、", + "consonant": null, + "consonantLength": null, + "vowel": "pau", + "vowelLength": 1.4939818382263184 + } + }, + { + "moras": [ + { + "text": "ウ", + "consonant": null, + "consonantLength": null, + "vowel": "u", + "vowelLength": 0.47278499603271484 + } + ], + "pauseMora": { + "text": "、", + "consonant": null, + "consonantLength": null, + "vowel": "pau", + "vowelLength": 1.360854148864746 + } + }, + { + "moras": [ + { + "text": "エ", + "consonant": null, + "consonantLength": null, + "vowel": "e", + "vowelLength": 0.480140745639801 + } + ], + "pauseMora": { + "text": "、", + "consonant": null, + "consonantLength": null, + "vowel": "pau", + "vowelLength": 1.277888536453247 + } + }, + { + "moras": [ + { + "text": "オ", + "consonant": null, + "consonantLength": null, + "vowel": "o", + "vowelLength": 0.40523019433021545 + } + ], + "pauseMora": { + "text": "、", + "consonant": null, + "consonantLength": null, + "vowel": "pau", + "vowelLength": 1.101143479347229 + } + }, + { + "moras": [ + { + "text": "パ", + "consonant": "p", + "consonantLength": 0.1379900425672531, + "vowel": "a", + "vowelLength": 0.2939148545265198 + } + ], + "pauseMora": { + "text": "、", + "consonant": null, + "consonantLength": null, + "vowel": "pau", + "vowelLength": 0.5955608487129211 + } + }, + { + "moras": [ + { + "text": "ピ", + "consonant": "p", + "consonantLength": 0.11043529957532883, + "vowel": "i", + "vowelLength": 0.21403124928474426 + } + ], + "pauseMora": { + "text": "、", + "consonant": null, + "consonantLength": null, + "vowel": "pau", + "vowelLength": 0.7473998069763184 + } + }, + { + "moras": [ + { + "text": "プ", + "consonant": "p", + "consonantLength": 0.10759306699037552, + "vowel": "u", + "vowelLength": 0.27301403880119324 + } + ], + "pauseMora": { + "text": "、", + "consonant": null, + "consonantLength": null, + "vowel": "pau", + "vowelLength": 0.6883317828178406 + } + }, + { + "moras": [ + { + "text": "ペ", + "consonant": "p", + "consonantLength": 0.1006699874997139, + "vowel": "e", + "vowelLength": 0.3309115469455719 + } + ], + "pauseMora": { + "text": "、", + "consonant": null, + "consonantLength": null, + "vowel": "pau", + "vowelLength": 0.5737814903259277 + } + }, + { + "moras": [ + { + "text": "ポ", + "consonant": "p", + "consonantLength": 0.11192715913057327, + "vowel": "o", + "vowelLength": 0.34295541048049927 + } + ], + "pauseMora": { + "text": "、", + "consonant": null, + "consonantLength": null, + "vowel": "pau", + "vowelLength": 0.6980905532836914 + } + }, + { + "moras": [ + { + "text": "ン", + "consonant": null, + "consonantLength": null, + "vowel": "N", + "vowelLength": 0.2119991034269333 + } + ], + "pauseMora": { + "text": "、", + "consonant": null, + "consonantLength": null, + "vowel": "pau", + "vowelLength": 0.8075965046882629 + } + }, + { + "moras": [ + { + "text": "マ", + "consonant": "m", + "consonantLength": 0.11556573957204819, + "vowel": "a", + "vowelLength": 0.19224266707897186 + }, + { + "text": "ン", + "consonant": null, + "consonantLength": null, + "vowel": "N", + "vowelLength": 0.07576050609350204 + }, + { + "text": "マ", + "consonant": "m", + "consonantLength": 0.05635761469602585, + "vowel": "a", + "vowelLength": 0.16269516944885254 + } + ], + "pauseMora": { + "text": "、", + "consonant": null, + "consonantLength": null, + "vowel": "pau", + "vowelLength": 0.3299731910228729 + } + }, + { + "moras": [ + { + "text": "ナ", + "consonant": "n", + "consonantLength": 0.0706525668501854, + "vowel": "a", + "vowelLength": 0.08505970984697342 + } + ] + }, + { + "moras": [ + { + "text": "ノ", + "consonant": "n", + "consonantLength": 0.05904378741979599, + "vowel": "o", + "vowelLength": 0.09998897463083267 + }, + { + "text": "ダ", + "consonant": "d", + "consonantLength": 0.05008454620838165, + "vowel": "a", + "vowelLength": 0.21048195660114288 + } + ] + } + ] + } } ] diff --git a/voicevox-remotion-template/src/generated/lipsync/manifest.json b/voicevox-remotion-template/src/generated/lipsync/manifest.json index 3cea8ed..24febe9 100644 --- a/voicevox-remotion-template/src/generated/lipsync/manifest.json +++ b/voicevox-remotion-template/src/generated/lipsync/manifest.json @@ -8297,274 +8297,243 @@ "version": 1, "source": { "audio": "audio/zundamon-rework-standee-demo/lines/zundamon-rework-standee-demo-zunda-001.wav", - "engine": "rhubarb-lip-sync", - "recognizer": "phonetic" + "engine": "voicevox-query" }, - "duration": 16.37, + "duration": 16.373333333333335, "cues": [ { "start": 0, - "end": 0.07, + "end": 0.09782661641193983, "mouth": "rest", - "source": "X" + "source": "prePhoneme" }, { - "start": 0.07, - "end": 0.56, - "mouth": "i", - "source": "B" + "start": 0.09782661641193983, + "end": 0.5479857660645239, + "mouth": "a", + "source": "vowel:a" }, { - "start": 0.56, - "end": 1.72, + "start": 0.5479857660645239, + "end": 1.7212084146290305, "mouth": "rest", - "source": "X" + "source": "pause" }, { - "start": 1.72, - "end": 1.97, + "start": 1.7212084146290305, + "end": 2.143343488741509, "mouth": "i", - "source": "B" + "source": "vowel:i" }, { - "start": 1.97, - "end": 2.04, - "mouth": "e", - "source": "C" - }, - { - "start": 2.04, - "end": 2.11, - "mouth": "i", - "source": "B" - }, - { - "start": 2.11, - "end": 3.61, + "start": 2.143343488741509, + "end": 3.6048553708872166, "mouth": "rest", - "source": "X" + "source": "pause" }, { - "start": 3.61, - "end": 3.66, - "mouth": "i", - "source": "B" - }, - { - "start": 3.66, - "end": 3.7, - "mouth": "e", - "source": "C" - }, - { - "start": 3.7, - "end": 3.9, - "mouth": "closed", - "source": "A" - }, - { - "start": 3.9, - "end": 4.01, - "mouth": "i", - "source": "B" - }, - { - "start": 4.01, - "end": 5.38, - "mouth": "rest", - "source": "X" - }, - { - "start": 5.38, - "end": 5.9, - "mouth": "i", - "source": "B" - }, - { - "start": 5.9, - "end": 7.11, - "mouth": "rest", - "source": "X" - }, - { - "start": 7.11, - "end": 7.16, - "mouth": "i", - "source": "B" - }, - { - "start": 7.16, - "end": 7.21, - "mouth": "e", - "source": "C" - }, - { - "start": 7.21, - "end": 7.42, - "mouth": "i", - "source": "B" - }, - { - "start": 7.42, - "end": 7.49, - "mouth": "e", - "source": "C" - }, - { - "start": 7.49, - "end": 8.68, - "mouth": "rest", - "source": "X" - }, - { - "start": 8.68, - "end": 9.04, + "start": 3.6048553708872166, + "end": 4.067364935409346, "mouth": "u", - "source": "F" + "source": "vowel:u" }, { - "start": 9.04, - "end": 9.65, + "start": 4.067364935409346, + "end": 5.398642503545229, "mouth": "rest", - "source": "X" + "source": "pause" }, { - "start": 9.65, - "end": 9.95, - "mouth": "i", - "source": "B" - }, - { - "start": 9.95, - "end": 10.71, - "mouth": "rest", - "source": "X" - }, - { - "start": 10.71, - "end": 10.78, - "mouth": "i", - "source": "B" - }, - { - "start": 10.78, - "end": 10.84, + "start": 5.398642503545229, + "end": 5.8683479490197055, "mouth": "e", - "source": "C" + "source": "vowel:e" }, { - "start": 10.84, - "end": 10.92, - "mouth": "closed", - "source": "A" - }, - { - "start": 10.92, - "end": 11.04, - "mouth": "i", - "source": "B" - }, - { - "start": 11.04, - "end": 11.75, + "start": 5.8683479490197055, + "end": 7.118463065747975, "mouth": "rest", - "source": "X" + "source": "pause" }, { - "start": 11.75, - "end": 11.79, - "mouth": "i", - "source": "B" + "start": 7.118463065747975, + "end": 7.514886053540754, + "mouth": "o", + "source": "vowel:o" }, { - "start": 11.79, - "end": 11.9, - "mouth": "e", - "source": "C" - }, - { - "start": 11.9, - "end": 11.98, - "mouth": "closed", - "source": "A" - }, - { - "start": 11.98, - "end": 12.15, - "mouth": "i", - "source": "B" - }, - { - "start": 12.15, - "end": 12.74, + "start": 7.514886053540754, + "end": 8.592097461226855, "mouth": "rest", - "source": "X" + "source": "pause" }, { - "start": 12.74, - "end": 13.11, - "mouth": "i", - "source": "B" + "start": 8.592097461226855, + "end": 8.727088450855794, + "mouth": "closed", + "source": "consonant:p" }, { - "start": 13.11, - "end": 14.8, + "start": 8.727088450855794, + "end": 9.014615408171164, + "mouth": "a", + "source": "vowel:a" + }, + { + "start": 9.014615408171164, + "end": 9.597232435141246, "mouth": "rest", - "source": "X" + "source": "pause" }, { - "start": 14.8, - "end": 14.88, - "mouth": "i", - "source": "B" - }, - { - "start": 14.88, - "end": 15.09, + "start": 9.597232435141246, + "end": 9.70526755204018, "mouth": "closed", - "source": "A" + "source": "consonant:p" }, { - "start": 15.09, - "end": 15.24, + "start": 9.70526755204018, + "end": 9.91464708127965, "mouth": "i", - "source": "B" + "source": "vowel:i" }, { - "start": 15.24, - "end": 15.72, + "start": 9.91464708127965, + "end": 10.645803023513952, + "mouth": "rest", + "source": "pause" + }, + { + "start": 10.645803023513952, + "end": 10.751057680444468, "mouth": "closed", - "source": "A" + "source": "consonant:p" }, { - "start": 15.72, - "end": 15.84, - "mouth": "i", - "source": "B" - }, - { - "start": 15.84, - "end": 15.92, - "mouth": "closed", - "source": "A" - }, - { - "start": 15.92, - "end": 16.15, + "start": 10.751057680444468, + "end": 11.018138076933257, "mouth": "u", - "source": "F" + "source": "vowel:u" }, { - "start": 16.15, - "end": 16.31, - "mouth": "closed", - "source": "A" - }, - { - "start": 16.31, - "end": 16.37, + "start": 11.018138076933257, + "end": 11.691509769751931, "mouth": "rest", - "source": "X" + "source": "pause" + }, + { + "start": 11.691509769751931, + "end": 11.789991812265225, + "mouth": "closed", + "source": "consonant:p" + }, + { + "start": 11.789991812265225, + "end": 12.113711381958485, + "mouth": "e", + "source": "vowel:e" + }, + { + "start": 12.113711381958485, + "end": 12.675022399542343, + "mouth": "rest", + "source": "pause" + }, + { + "start": 12.675022399542343, + "end": 12.78451695216579, + "mouth": "closed", + "source": "consonant:p" + }, + { + "start": 12.78451695216579, + "end": 13.120018626040542, + "mouth": "o", + "source": "vowel:o" + }, + { + "start": 13.120018626040542, + "end": 13.802936993809368, + "mouth": "rest", + "source": "pause" + }, + { + "start": 13.802936993809368, + "end": 14.010328543515586, + "mouth": "closed", + "source": "vowel:N" + }, + { + "start": 14.010328543515586, + "end": 14.800372878313206, + "mouth": "rest", + "source": "pause" + }, + { + "start": 14.800372878313206, + "end": 14.913426931067976, + "mouth": "closed", + "source": "consonant:m" + }, + { + "start": 14.913426931067976, + "end": 15.101491427571403, + "mouth": "a", + "source": "vowel:a" + }, + { + "start": 15.101491427571403, + "end": 15.230738114806838, + "mouth": "closed", + "source": "vowel:N+consonant:m" + }, + { + "start": 15.230738114806838, + "end": 15.389897294144323, + "mouth": "a", + "source": "vowel:a" + }, + { + "start": 15.389897294144323, + "end": 15.781815917546227, + "mouth": "rest", + "source": "pause+consonant:n" + }, + { + "start": 15.781815917546227, + "end": 15.865026953619335, + "mouth": "a", + "source": "vowel:a" + }, + { + "start": 15.865026953619335, + "end": 15.92278749305358, + "mouth": "rest", + "source": "consonant:n" + }, + { + "start": 15.92278749305358, + "end": 16.020603323719918, + "mouth": "o", + "source": "vowel:o" + }, + { + "start": 16.020603323719918, + "end": 16.06959934062085, + "mouth": "rest", + "source": "consonant:d" + }, + { + "start": 16.06959934062085, + "end": 16.275506716921395, + "mouth": "a", + "source": "vowel:a" + }, + { + "start": 16.275506716921395, + "end": 16.373333333333335, + "mouth": "rest", + "source": "postPhoneme" } ] } diff --git a/voicevox-remotion-template/src/generated/lipsync/zundamon-rework-standee-demo-zunda-001.mouth.json b/voicevox-remotion-template/src/generated/lipsync/zundamon-rework-standee-demo-zunda-001.mouth.json index 05d6420..ca7f925 100644 --- a/voicevox-remotion-template/src/generated/lipsync/zundamon-rework-standee-demo-zunda-001.mouth.json +++ b/voicevox-remotion-template/src/generated/lipsync/zundamon-rework-standee-demo-zunda-001.mouth.json @@ -2,274 +2,243 @@ "version": 1, "source": { "audio": "audio/zundamon-rework-standee-demo/lines/zundamon-rework-standee-demo-zunda-001.wav", - "engine": "rhubarb-lip-sync", - "recognizer": "phonetic" + "engine": "voicevox-query" }, - "duration": 16.37, + "duration": 16.373333333333335, "cues": [ { "start": 0, - "end": 0.07, + "end": 0.09782661641193983, "mouth": "rest", - "source": "X" + "source": "prePhoneme" }, { - "start": 0.07, - "end": 0.56, - "mouth": "i", - "source": "B" + "start": 0.09782661641193983, + "end": 0.5479857660645239, + "mouth": "a", + "source": "vowel:a" }, { - "start": 0.56, - "end": 1.72, + "start": 0.5479857660645239, + "end": 1.7212084146290305, "mouth": "rest", - "source": "X" + "source": "pause" }, { - "start": 1.72, - "end": 1.97, + "start": 1.7212084146290305, + "end": 2.143343488741509, "mouth": "i", - "source": "B" + "source": "vowel:i" }, { - "start": 1.97, - "end": 2.04, - "mouth": "e", - "source": "C" - }, - { - "start": 2.04, - "end": 2.11, - "mouth": "i", - "source": "B" - }, - { - "start": 2.11, - "end": 3.61, + "start": 2.143343488741509, + "end": 3.6048553708872166, "mouth": "rest", - "source": "X" + "source": "pause" }, { - "start": 3.61, - "end": 3.66, - "mouth": "i", - "source": "B" - }, - { - "start": 3.66, - "end": 3.7, - "mouth": "e", - "source": "C" - }, - { - "start": 3.7, - "end": 3.9, - "mouth": "closed", - "source": "A" - }, - { - "start": 3.9, - "end": 4.01, - "mouth": "i", - "source": "B" - }, - { - "start": 4.01, - "end": 5.38, - "mouth": "rest", - "source": "X" - }, - { - "start": 5.38, - "end": 5.9, - "mouth": "i", - "source": "B" - }, - { - "start": 5.9, - "end": 7.11, - "mouth": "rest", - "source": "X" - }, - { - "start": 7.11, - "end": 7.16, - "mouth": "i", - "source": "B" - }, - { - "start": 7.16, - "end": 7.21, - "mouth": "e", - "source": "C" - }, - { - "start": 7.21, - "end": 7.42, - "mouth": "i", - "source": "B" - }, - { - "start": 7.42, - "end": 7.49, - "mouth": "e", - "source": "C" - }, - { - "start": 7.49, - "end": 8.68, - "mouth": "rest", - "source": "X" - }, - { - "start": 8.68, - "end": 9.04, + "start": 3.6048553708872166, + "end": 4.067364935409346, "mouth": "u", - "source": "F" + "source": "vowel:u" }, { - "start": 9.04, - "end": 9.65, + "start": 4.067364935409346, + "end": 5.398642503545229, "mouth": "rest", - "source": "X" + "source": "pause" }, { - "start": 9.65, - "end": 9.95, - "mouth": "i", - "source": "B" - }, - { - "start": 9.95, - "end": 10.71, - "mouth": "rest", - "source": "X" - }, - { - "start": 10.71, - "end": 10.78, - "mouth": "i", - "source": "B" - }, - { - "start": 10.78, - "end": 10.84, + "start": 5.398642503545229, + "end": 5.8683479490197055, "mouth": "e", - "source": "C" + "source": "vowel:e" }, { - "start": 10.84, - "end": 10.92, - "mouth": "closed", - "source": "A" - }, - { - "start": 10.92, - "end": 11.04, - "mouth": "i", - "source": "B" - }, - { - "start": 11.04, - "end": 11.75, + "start": 5.8683479490197055, + "end": 7.118463065747975, "mouth": "rest", - "source": "X" + "source": "pause" }, { - "start": 11.75, - "end": 11.79, - "mouth": "i", - "source": "B" + "start": 7.118463065747975, + "end": 7.514886053540754, + "mouth": "o", + "source": "vowel:o" }, { - "start": 11.79, - "end": 11.9, - "mouth": "e", - "source": "C" - }, - { - "start": 11.9, - "end": 11.98, - "mouth": "closed", - "source": "A" - }, - { - "start": 11.98, - "end": 12.15, - "mouth": "i", - "source": "B" - }, - { - "start": 12.15, - "end": 12.74, + "start": 7.514886053540754, + "end": 8.592097461226855, "mouth": "rest", - "source": "X" + "source": "pause" }, { - "start": 12.74, - "end": 13.11, - "mouth": "i", - "source": "B" + "start": 8.592097461226855, + "end": 8.727088450855794, + "mouth": "closed", + "source": "consonant:p" }, { - "start": 13.11, - "end": 14.8, + "start": 8.727088450855794, + "end": 9.014615408171164, + "mouth": "a", + "source": "vowel:a" + }, + { + "start": 9.014615408171164, + "end": 9.597232435141246, "mouth": "rest", - "source": "X" + "source": "pause" }, { - "start": 14.8, - "end": 14.88, - "mouth": "i", - "source": "B" - }, - { - "start": 14.88, - "end": 15.09, + "start": 9.597232435141246, + "end": 9.70526755204018, "mouth": "closed", - "source": "A" + "source": "consonant:p" }, { - "start": 15.09, - "end": 15.24, + "start": 9.70526755204018, + "end": 9.91464708127965, "mouth": "i", - "source": "B" + "source": "vowel:i" }, { - "start": 15.24, - "end": 15.72, + "start": 9.91464708127965, + "end": 10.645803023513952, + "mouth": "rest", + "source": "pause" + }, + { + "start": 10.645803023513952, + "end": 10.751057680444468, "mouth": "closed", - "source": "A" + "source": "consonant:p" }, { - "start": 15.72, - "end": 15.84, - "mouth": "i", - "source": "B" - }, - { - "start": 15.84, - "end": 15.92, - "mouth": "closed", - "source": "A" - }, - { - "start": 15.92, - "end": 16.15, + "start": 10.751057680444468, + "end": 11.018138076933257, "mouth": "u", - "source": "F" + "source": "vowel:u" }, { - "start": 16.15, - "end": 16.31, - "mouth": "closed", - "source": "A" - }, - { - "start": 16.31, - "end": 16.37, + "start": 11.018138076933257, + "end": 11.691509769751931, "mouth": "rest", - "source": "X" + "source": "pause" + }, + { + "start": 11.691509769751931, + "end": 11.789991812265225, + "mouth": "closed", + "source": "consonant:p" + }, + { + "start": 11.789991812265225, + "end": 12.113711381958485, + "mouth": "e", + "source": "vowel:e" + }, + { + "start": 12.113711381958485, + "end": 12.675022399542343, + "mouth": "rest", + "source": "pause" + }, + { + "start": 12.675022399542343, + "end": 12.78451695216579, + "mouth": "closed", + "source": "consonant:p" + }, + { + "start": 12.78451695216579, + "end": 13.120018626040542, + "mouth": "o", + "source": "vowel:o" + }, + { + "start": 13.120018626040542, + "end": 13.802936993809368, + "mouth": "rest", + "source": "pause" + }, + { + "start": 13.802936993809368, + "end": 14.010328543515586, + "mouth": "closed", + "source": "vowel:N" + }, + { + "start": 14.010328543515586, + "end": 14.800372878313206, + "mouth": "rest", + "source": "pause" + }, + { + "start": 14.800372878313206, + "end": 14.913426931067976, + "mouth": "closed", + "source": "consonant:m" + }, + { + "start": 14.913426931067976, + "end": 15.101491427571403, + "mouth": "a", + "source": "vowel:a" + }, + { + "start": 15.101491427571403, + "end": 15.230738114806838, + "mouth": "closed", + "source": "vowel:N+consonant:m" + }, + { + "start": 15.230738114806838, + "end": 15.389897294144323, + "mouth": "a", + "source": "vowel:a" + }, + { + "start": 15.389897294144323, + "end": 15.781815917546227, + "mouth": "rest", + "source": "pause+consonant:n" + }, + { + "start": 15.781815917546227, + "end": 15.865026953619335, + "mouth": "a", + "source": "vowel:a" + }, + { + "start": 15.865026953619335, + "end": 15.92278749305358, + "mouth": "rest", + "source": "consonant:n" + }, + { + "start": 15.92278749305358, + "end": 16.020603323719918, + "mouth": "o", + "source": "vowel:o" + }, + { + "start": 16.020603323719918, + "end": 16.06959934062085, + "mouth": "rest", + "source": "consonant:d" + }, + { + "start": 16.06959934062085, + "end": 16.275506716921395, + "mouth": "a", + "source": "vowel:a" + }, + { + "start": 16.275506716921395, + "end": 16.373333333333335, + "mouth": "rest", + "source": "postPhoneme" } ] } diff --git a/voicevox-remotion-template/src/lipsync/types.ts b/voicevox-remotion-template/src/lipsync/types.ts index 980ef73..cd40205 100644 --- a/voicevox-remotion-template/src/lipsync/types.ts +++ b/voicevox-remotion-template/src/lipsync/types.ts @@ -18,8 +18,8 @@ version: 1; source: Readonly<{ audio: string; - engine: "rhubarb-lip-sync"; - recognizer: "phonetic"; + engine: "rhubarb-lip-sync" | "voicevox-query"; + recognizer?: "phonetic"; }>; duration: number; cues: MouthCue[]; diff --git a/voicevox-remotion-template/src/zundamon-rework-standee-demo.tsx b/voicevox-remotion-template/src/zundamon-rework-standee-demo.tsx index 5b4f478..9eb0504 100644 --- a/voicevox-remotion-template/src/zundamon-rework-standee-demo.tsx +++ b/voicevox-remotion-template/src/zundamon-rework-standee-demo.tsx @@ -190,7 +190,7 @@ ? `expression: ${activeEvent.label}` : activeEvent?.type === "mouthCycle" ? `mouth: ${activeMouth}` - : "default rhubarb lip sync"; + : "default voicevox-query lip sync"; const sequences = scheduledEvents.map((scheduledEvent, index) => (