Newer
Older
remotion_docker_devcontainer / voicevox-remotion-template / scripts / lipsync-utils.js
/**
 * Frozen lookup table from a Rhubarb mouth-shape code (the letters "A"–"H"
 * plus "X") to the Japanese mouth shape name used by this module.
 *
 * Some codes share a target: "B" and "G" both map to "i", and "C" and "H"
 * both map to "e".
 */
export const RHUBARB_TO_JA_MOUTH = Object.freeze(
  Object.fromEntries([
    ["X", "rest"],
    ["A", "closed"],
    ["B", "i"],
    ["C", "e"],
    ["D", "a"],
    ["E", "o"],
    ["F", "u"],
    ["G", "i"],
    ["H", "e"],
  ])
);

/**
 * Frozen list of the Japanese mouth shape names: the vowel shapes "a", "i",
 * "u", "e", "o" first, followed by "closed" and "rest". These are exactly the
 * distinct values that appear in RHUBARB_TO_JA_MOUTH.
 */
export const JAPANESE_MOUTH_SHAPES = Object.freeze(
  ["a", "i", "u", "e", "o"].concat("closed", "rest")
);

/**
 * Translate a Rhubarb mouth-shape code into the Japanese mouth shape name
 * listed for it in RHUBARB_TO_JA_MOUTH.
 *
 * Only the table's own keys count as known codes. Names inherited from
 * Object.prototype (for example "toString" or "constructor") are treated as
 * unknown instead of leaking an inherited function out as a "mouth shape".
 *
 * @param {string} source - Mouth-shape code to look up.
 * @param {string[]} [warnings=[]] - Collector that receives a message when
 *   `source` is unknown. When omitted, the message is pushed to a throwaway
 *   array and therefore discarded.
 * @returns {string} The mapped mouth shape, or "rest" for an unknown code.
 */
export const mapRhubarbMouthToJapanese = (source, warnings = []) => {
  // Own-property check: a plain `table[source]` lookup would also see
  // inherited members of Object.prototype, which are truthy.
  if (Object.hasOwn(RHUBARB_TO_JA_MOUTH, source)) {
    return RHUBARB_TO_JA_MOUTH[source];
  }

  warnings.push(`Unknown Rhubarb mouth shape "${source}", using "rest".`);
  return "rest";
};

/**
 * Guard used while validating mouth cues: passes silently when `value` is a
 * finite number and throws otherwise.
 *
 * @param {*} value - Candidate value; must already be of type number (no
 *   coercion is performed by `Number.isFinite`).
 * @param {string} fieldName - Cue field being checked, used in the message.
 * @param {number} index - Position of the cue in `mouthCues`, used in the message.
 * @throws {Error} When `value` is NaN, ±Infinity, or not a number at all.
 */
const assertFiniteNumber = (value, fieldName, index) => {
  const isValid = Number.isFinite(value);
  if (isValid) {
    return;
  }

  throw new Error(`mouthCues[${index}].${fieldName} must be a number.`);
};

/**
 * Convert parsed Rhubarb JSON into a versioned timeline object plus a list of
 * warnings gathered while mapping mouth shapes.
 *
 * Every entry of `rhubarbJson.mouthCues` becomes `{start, end, mouth, source}`,
 * where `source` is the cue's original `value` (stringified if it was not a
 * string) and `mouth` is the result of `mapRhubarbMouthToJapanese(source)`.
 *
 * `start`, `end` and `metadata.duration` are accepted as numbers or as
 * non-blank numeric strings. Other values (null, "", booleans, arrays, ...)
 * are rejected rather than silently coerced to 0 or 1.
 *
 * @param {object} rhubarbJson - Parsed Rhubarb output; must contain a
 *   `mouthCues` array. `metadata.duration` and `metadata.soundFile` are
 *   optional.
 * @param {object} [options]
 * @param {string} [options.audio] - Audio reference stored in
 *   `timeline.source.audio`; falls back to `metadata.soundFile`, then "".
 * @param {string} [options.recognizer="phonetic"] - Stored verbatim in
 *   `timeline.source.recognizer`.
 * @returns {{timeline: object, warnings: string[]}} The normalized timeline
 *   and any unknown-mouth-shape warnings.
 * @throws {Error} When `mouthCues` is missing or not an array, or when a cue's
 *   `start`/`end` is not a finite number.
 */
export const normalizeRhubarbJson = (
  rhubarbJson,
  {audio, recognizer = "phonetic"} = {}
) => {
  if (!rhubarbJson || !Array.isArray(rhubarbJson.mouthCues)) {
    throw new Error("Rhubarb JSON must contain a mouthCues array.");
  }

  // Bare Number() maps null, "", whitespace, booleans and [] to 0 or 1, which
  // would let malformed timestamps pass the finite check. Anything that is not
  // a number or a non-blank string becomes NaN so validation catches it.
  const toNumber = (value) => {
    if (typeof value === "number") {
      return value;
    }
    if (typeof value === "string" && value.trim() !== "") {
      return Number(value);
    }
    return Number.NaN;
  };

  const warnings = [];
  const cues = rhubarbJson.mouthCues.map((cue, index) => {
    const start = toNumber(cue?.start);
    const end = toNumber(cue?.end);
    assertFiniteNumber(start, "start", index);
    assertFiniteNumber(end, "end", index);

    const source = typeof cue?.value === "string" ? cue.value : String(cue?.value);

    return {
      start,
      end,
      mouth: mapRhubarbMouthToJapanese(source, warnings),
      source,
    };
  });

  // Prefer the declared duration; otherwise use the end of the last cue in
  // array order, or 0 when there are no cues.
  const metadataDuration = toNumber(rhubarbJson.metadata?.duration);
  const duration = Number.isFinite(metadataDuration)
    ? metadataDuration
    : cues.at(-1)?.end ?? 0;

  return {
    timeline: {
      version: 1,
      source: {
        audio: audio ?? rhubarbJson.metadata?.soundFile ?? "",
        engine: "rhubarb-lip-sync",
        recognizer,
      },
      duration,
      cues,
    },
    warnings,
  };
};