/**
 * Lookup table from a Rhubarb Lip Sync mouth-shape letter to the Japanese
 * mouth-shape name used in the normalized timeline.
 *
 * The mapping is many-to-one: B and G both become "i", C and H both become
 * "e". The table is frozen so it can be shared safely between callers.
 */
export const RHUBARB_TO_JA_MOUTH = Object.freeze({
  // Non-vowel targets.
  X: "rest",
  A: "closed",
  // Letters with a vowel target of their own.
  B: "i",
  C: "e",
  D: "a",
  E: "o",
  F: "u",
  // Letters that reuse a vowel target already assigned above.
  G: "i",
  H: "e",
});
/**
 * Frozen list of the mouth-shape names produced by the mappings in this
 * module: the five vowels followed by "closed" and "rest".
 */
export const JAPANESE_MOUTH_SHAPES = Object.freeze(["a", "i", "u", "e", "o", "closed", "rest"]);
/**
 * Translates a Rhubarb mouth-shape value into a Japanese mouth-shape name.
 *
 * @param {string} source - Rhubarb mouth-shape value, expected to be a key of
 *   `RHUBARB_TO_JA_MOUTH`.
 * @param {string[]} [warnings] - Collector that receives one message when
 *   `source` is not a known shape. Defaults to a throwaway array.
 * @returns {string} The mapped mouth shape, or "rest" when `source` is unknown.
 */
export const mapRhubarbMouthToJapanese = (source, warnings = []) => {
  // Only own keys count as known shapes. A plain property read would also
  // resolve inherited names such as "constructor" or "toString" to truthy
  // Object.prototype members and return them as if they were mouth shapes.
  if (Object.hasOwn(RHUBARB_TO_JA_MOUTH, source)) {
    return RHUBARB_TO_JA_MOUTH[source];
  }
  warnings.push(`Unknown Rhubarb mouth shape "${source}", using "rest".`);
  return "rest";
};
/**
 * Lookup table from a VOICEVOX mora `vowel` value to a Japanese mouth shape.
 *
 * Covers the voiced vowels, the moraic nasal "N", and the devoiced vowels,
 * which VOICEVOX reports in upper case ("A", "I", "U", "E", "O"). A devoiced
 * vowel is mapped to the same shape as its voiced counterpart. Any other
 * value (for example "cl" or "pau") is absent from the table and reads as
 * `undefined`.
 */
const VOICEVOX_VOWEL_TO_JA_MOUTH = Object.freeze({
  // No prototype: inherited names such as "constructor" must not resolve to
  // Object.prototype members when the table is indexed with input data.
  __proto__: null,
  a: "a",
  i: "i",
  u: "u",
  e: "e",
  o: "o",
  N: "closed",
  // Devoiced vowels keep the mouth shape of the underlying vowel.
  A: "a",
  I: "i",
  U: "u",
  E: "e",
  O: "o",
});
/**
 * VOICEVOX consonant values that are shown with the "closed" mouth shape.
 * Includes the bilabials "p", "b", "m" and their palatalized forms
 * "py", "by", "my", which start from the same lip closure.
 */
const CLOSED_CONSONANTS = new Set(["p", "b", "m", "py", "by", "my"]);
/**
 * Guards a mouth-cue field: does nothing when `value` is a finite number and
 * throws otherwise.
 *
 * @param {unknown} value - Value to check.
 * @param {string} fieldName - Field name used in the error message.
 * @param {number} index - Position of the cue inside `mouthCues`, used in the
 *   error message.
 * @throws {Error} When `value` is not a finite number.
 */
const assertFiniteNumber = (value, fieldName, index) => {
  const isValid = Number.isFinite(value);
  if (isValid) {
    return;
  }
  throw new Error(`mouthCues[${index}].${fieldName} must be a number.`);
};
/**
 * Converts a raw JSON field to a number without the lenient coercions of
 * `Number()`. Only numbers and non-blank strings are converted; everything
 * else (null, undefined, booleans, "", arrays, objects) becomes NaN so that
 * it is treated as missing or invalid instead of silently turning into 0 or 1.
 */
const toStrictNumber = (value) => {
  const isConvertible =
    typeof value === "number" ||
    (typeof value === "string" && value.trim() !== "");
  return isConvertible ? Number(value) : Number.NaN;
};

/**
 * Normalizes Rhubarb Lip Sync JSON output into a mouth-cue timeline.
 *
 * @param {object} rhubarbJson - Parsed Rhubarb output. Must have a `mouthCues`
 *   array whose entries carry `start`, `end` and `value`; `metadata.duration`
 *   and `metadata.soundFile` are used when present.
 * @param {object} [options]
 * @param {string} [options.audio] - Audio reference stored in the timeline;
 *   falls back to `metadata.soundFile`, then to "".
 * @param {string} [options.recognizer="phonetic"] - Recognizer name recorded
 *   in the timeline source.
 * @returns {{timeline: object, warnings: string[]}} The timeline (version 1)
 *   and one warning per cue whose mouth shape was unknown.
 * @throws {Error} When `mouthCues` is missing or not an array, or when a cue's
 *   `start` or `end` is not a finite number (or numeric string).
 */
export const normalizeRhubarbJson = (
  rhubarbJson,
  {audio, recognizer = "phonetic"} = {}
) => {
  if (!rhubarbJson || !Array.isArray(rhubarbJson.mouthCues)) {
    throw new Error("Rhubarb JSON must contain a mouthCues array.");
  }
  const warnings = [];
  const cues = rhubarbJson.mouthCues.map((cue, index) => {
    // Strict conversion: a null or empty start/end must fail validation
    // rather than be accepted as time 0.
    const start = toStrictNumber(cue?.start);
    const end = toStrictNumber(cue?.end);
    assertFiniteNumber(start, "start", index);
    assertFiniteNumber(end, "end", index);
    const source = typeof cue?.value === "string" ? cue.value : String(cue?.value);
    return {
      start,
      end,
      mouth: mapRhubarbMouthToJapanese(source, warnings),
      source,
    };
  });
  // A missing, null or otherwise non-numeric metadata duration falls back to
  // the end of the last cue (0 when there are no cues).
  const metadataDuration = toStrictNumber(rhubarbJson.metadata?.duration);
  const duration = Number.isFinite(metadataDuration)
    ? metadataDuration
    : cues.at(-1)?.end ?? 0;
  return {
    timeline: {
      version: 1,
      source: {
        audio: audio ?? rhubarbJson.metadata?.soundFile ?? "",
        engine: "rhubarb-lip-sync",
        recognizer,
      },
      duration,
      cues,
    },
    warnings,
  };
};
/**
 * Sanitizes a duration value.
 *
 * @param {unknown} value - Candidate duration.
 * @returns {number} `value` when it is a finite number greater than zero,
 *   otherwise 0.
 */
const positiveDuration = (value) => {
  // Number.isFinite is false for every non-number, so no separate typeof
  // check is needed.
  if (Number.isFinite(value) && value > 0) {
    return value;
  }
  return 0;
};
/**
 * Splits one VOICEVOX mora into its timed mouth parts.
 *
 * A consonant part is emitted when `consonantLength` is positive, followed by
 * a vowel part when `vowelLength` is positive. Consonants listed in
 * `CLOSED_CONSONANTS` use "closed", all others "rest"; vowels are looked up
 * in `VOICEVOX_VOWEL_TO_JA_MOUTH` and default to "rest".
 *
 * @param {object} [mora] - Mora with optional `consonant`, `consonantLength`,
 *   `vowel` and `vowelLength` fields.
 * @returns {Array<{duration: number, mouth: string, source: string}>} Zero,
 *   one or two parts in consonant-then-vowel order.
 */
const voicevoxMoraToParts = (mora) => {
  // "kind:name" when the phoneme name is present, otherwise just "kind".
  const describe = (kind, name) => (name ? `${kind}:${name}` : kind);
  const result = [];

  const consonantDuration = positiveDuration(mora?.consonantLength);
  if (consonantDuration > 0) {
    const name = mora?.consonant;
    const mouth = CLOSED_CONSONANTS.has(name) ? "closed" : "rest";
    result.push({duration: consonantDuration, mouth, source: describe("consonant", name)});
  }

  const vowelDuration = positiveDuration(mora?.vowelLength);
  if (vowelDuration > 0) {
    const name = mora?.vowel;
    const mouth = VOICEVOX_VOWEL_TO_JA_MOUTH[name] ?? "rest";
    result.push({duration: vowelDuration, mouth, source: describe("vowel", name)});
  }

  return result;
};
/**
 * Appends `part` to `parts` unless its duration is zero or negative.
 *
 * @param {object[]} parts - Array that receives the part (mutated in place).
 * @param {{duration: number}} part - Candidate part.
 */
const appendCuePart = (parts, part) => {
  // Keep the "<= 0" form: a NaN or missing duration compares false here and
  // is therefore still appended.
  const isEmpty = part.duration <= 0;
  if (!isEmpty) {
    parts.push(part);
  }
};
/**
 * Flattens VOICEVOX timing data into an ordered list of timed mouth parts.
 *
 * Order of the result: the pre-phoneme silence, then for every accent phrase
 * the parts of each mora followed by the phrase's pause (if it has a
 * `pauseMora`), and finally the post-phoneme silence. Silences and pauses use
 * the "rest" mouth. Entries with a non-positive duration are left out.
 *
 * @param {object} [voicevoxTiming] - Object with optional `prePhonemeLength`,
 *   `postPhonemeLength` and `accentPhrases` fields.
 * @returns {Array<{duration: number, mouth: string, source: string}>}
 */
const voicevoxTimingToParts = (voicevoxTiming) => {
  const collected = [];
  // Adds a "rest" part of the given raw length, tagged with `source`.
  const addSilence = (rawLength, source) => {
    appendCuePart(collected, {
      duration: positiveDuration(rawLength),
      mouth: "rest",
      source,
    });
  };

  addSilence(voicevoxTiming?.prePhonemeLength, "prePhoneme");

  const accentPhrases = voicevoxTiming?.accentPhrases ?? [];
  for (const accentPhrase of accentPhrases) {
    const moras = accentPhrase?.moras ?? [];
    for (const mora of moras) {
      for (const moraPart of voicevoxMoraToParts(mora)) {
        appendCuePart(collected, moraPart);
      }
    }
    const pause = accentPhrase?.pauseMora;
    if (pause) {
      addSilence(pause.vowelLength, "pause");
    }
  }

  addSilence(voicevoxTiming?.postPhonemeLength, "postPhoneme");
  return collected;
};
/**
 * Adds `cue` to `cues`, folding it into the last cue when both share the same
 * mouth shape.
 *
 * On a merge the last cue is mutated in place: its `end` becomes `cue.end`,
 * and its `source` gets `+<cue.source>` appended unless both sources are
 * already equal. Otherwise `cue` itself is pushed.
 *
 * @param {object[]} cues - Cue list, mutated in place.
 * @param {{end: number, mouth: string, source: string}} cue - Cue to add.
 */
const mergeCue = (cues, cue) => {
  const last = cues.at(-1);
  const continuesLast = Boolean(last) && last.mouth === cue.mouth;
  if (!continuesLast) {
    cues.push(cue);
    return;
  }
  last.end = cue.end;
  if (last.source !== cue.source) {
    last.source = `${last.source}+${cue.source}`;
  }
};
/**
 * Normalizes VOICEVOX timing data into a mouth-cue timeline.
 *
 * The timing is flattened into parts, optionally stretched or compressed so
 * that the parts add up to `durationSeconds`, laid out back to back from time
 * 0, and adjacent parts with the same mouth shape are merged.
 *
 * @param {object} voicevoxTiming - Object with an `accentPhrases` array and
 *   optional `prePhonemeLength` / `postPhonemeLength`.
 * @param {object} [options]
 * @param {string} [options.audio=""] - Audio reference stored in the timeline.
 * @param {number} [options.durationSeconds] - Real audio duration used to
 *   rescale the cues. Ignored unless it is a finite number greater than zero.
 * @returns {{version: number, source: object, duration: number, cues: object[]}}
 *   The timeline (version 1, engine "voicevox-query").
 * @throws {Error} When `accentPhrases` is missing or not an array.
 */
export const normalizeVoicevoxTiming = (
  voicevoxTiming,
  {audio = "", durationSeconds} = {}
) => {
  if (!voicevoxTiming || !Array.isArray(voicevoxTiming.accentPhrases)) {
    throw new Error("VOICEVOX timing must contain an accentPhrases array.");
  }
  const parts = voicevoxTimingToParts(voicevoxTiming);
  const computedDuration = parts.reduce((sum, part) => sum + part.duration, 0);
  // A zero, negative or non-finite override cannot be used for scaling. If it
  // were still reported as the duration, the timeline would claim a length
  // that disagrees with its own (unscaled) cues, so fall back instead.
  const hasUsableOverride =
    Number.isFinite(durationSeconds) && durationSeconds > 0;
  const targetDuration = hasUsableOverride ? durationSeconds : computedDuration;
  // targetDuration is positive whenever computedDuration is.
  const scale = computedDuration > 0 ? targetDuration / computedDuration : 1;
  const cues = [];
  let cursor = 0;
  for (const part of parts) {
    const duration = part.duration * scale;
    if (duration <= 0) {
      continue;
    }
    const start = cursor;
    const end = cursor + duration;
    mergeCue(cues, {
      start,
      end,
      mouth: part.mouth,
      source: part.source,
    });
    cursor = end;
  }
  // Accumulating scaled floats can leave the final end a few ulps away from
  // the target; pin it so the cues cover the reported duration exactly.
  const lastCue = cues.at(-1);
  if (lastCue) {
    lastCue.end = targetDuration;
  }
  return {
    version: 1,
    source: {
      audio,
      engine: "voicevox-query",
    },
    duration: targetDuration,
    cues,
  };
};