// File: remotion_docker_devcontainer/voicevox-remotion-template/scripts/lipsync-utils.js
/**
 * Lookup table from a Rhubarb mouth-cue code (the single-letter `value` of a
 * cue) to one of the mouth-shape names listed in JAPANESE_MOUTH_SHAPES.
 *
 * Several codes share a target shape: B and G both map to "i", C and H both
 * map to "e". The object is frozen so the shared table cannot be mutated.
 */
export const RHUBARB_TO_JA_MOUTH = Object.freeze({
  X: "rest",
  A: "closed",
  B: "i",
  C: "e",
  D: "a",
  E: "o",
  F: "u",
  G: "i",
  H: "e",
});

/**
 * Frozen list of the mouth-shape names used by the timelines built in this
 * file: the five vowels plus "closed" and "rest".
 *
 * Every `mouth` value produced by the mapping tables visible in this file
 * appears in this list.
 */
export const JAPANESE_MOUTH_SHAPES = Object.freeze([
  "a",
  "i",
  "u",
  "e",
  "o",
  "closed",
  "rest",
]);

/**
 * Translate one Rhubarb mouth-shape code into a Japanese mouth-shape name.
 *
 * @param {string} source - Rhubarb shape code; matched against the own keys of
 *   RHUBARB_TO_JA_MOUTH (case-sensitive).
 * @param {string[]} [warnings=[]] - Collector that receives one message when
 *   the code is unknown. When omitted, the message goes to a throwaway array.
 * @returns {string} The mapped mouth shape, or "rest" for an unknown code.
 */
export const mapRhubarbMouthToJapanese = (source, warnings = []) => {
  // Own-key check on purpose: a bare `table[source]` lookup also finds members
  // inherited from Object.prototype ("constructor", "toString", "__proto__"),
  // which are truthy and would be returned as if they were a mouth shape.
  if (Object.hasOwn(RHUBARB_TO_JA_MOUTH, source)) {
    return RHUBARB_TO_JA_MOUTH[source];
  }

  warnings.push(`Unknown Rhubarb mouth shape "${source}", using "rest".`);
  return "rest";
};

/**
 * Lookup table from a VOICEVOX mora `vowel` string to a mouth-shape name.
 *
 * Keys are case-sensitive: lower-case a/i/u/e/o map to themselves and
 * upper-case "N" maps to "closed". Any other vowel string has no entry here,
 * so the caller's fallback applies (voicevoxMoraToParts uses "rest").
 *
 * NOTE(review): VOICEVOX reportedly emits upper-case vowels (A/I/U/E/O) for
 * devoiced moras; those are not listed and therefore end up as "rest".
 * Confirm that this is the intended mouth shape.
 */
const VOICEVOX_VOWEL_TO_JA_MOUTH = Object.freeze({
  a: "a",
  i: "i",
  u: "u",
  e: "e",
  o: "o",
  N: "closed",
});

/**
 * Consonant strings for which voicevoxMoraToParts emits the "closed" mouth
 * during the consonant segment; every other consonant yields "rest".
 *
 * NOTE(review): only the exact strings "p", "b" and "m" match. Multi-letter
 * variants (e.g. "py", "by", "my"), if the engine emits them, are treated as
 * "rest" — confirm whether they should close the mouth as well.
 */
const CLOSED_CONSONANTS = new Set(["p", "b", "m"]);

/**
 * Guard for a numeric mouth-cue field.
 *
 * @param {unknown} value - Candidate value; only finite numbers pass
 *   (strings, NaN and ±Infinity are rejected, no coercion is applied).
 * @param {string} fieldName - Field name used in the error message.
 * @param {number} index - Cue index used in the error message.
 * @returns {void}
 * @throws {Error} When `value` is not a finite number.
 */
const assertFiniteNumber = (value, fieldName, index) => {
  if (Number.isFinite(value)) {
    return;
  }

  const message = `mouthCues[${index}].${fieldName} must be a number.`;
  throw new Error(message);
};

/**
 * Coerce a raw Rhubarb time value (seconds) to a number.
 *
 * Numbers pass through and non-blank numeric strings are converted. Everything
 * else (null, undefined, "", booleans, arrays, objects) becomes NaN so that it
 * fails validation instead of being coerced to 0 or 1 by `Number(...)`.
 *
 * @param {unknown} raw - Value read from the Rhubarb JSON.
 * @returns {number} The numeric value, or NaN when `raw` is not usable.
 */
const parseRhubarbSeconds = (raw) => {
  if (typeof raw === "number") {
    return raw;
  }

  if (typeof raw === "string" && raw.trim() !== "") {
    return Number(raw);
  }

  return Number.NaN;
};

/**
 * Convert parsed Rhubarb Lip Sync JSON into this project's timeline format.
 *
 * @param {object} rhubarbJson - Parsed Rhubarb output; must have a `mouthCues`
 *   array whose items carry `start`, `end` and `value`.
 * @param {object} [options]
 * @param {string} [options.audio] - Audio reference stored in the timeline;
 *   defaults to `metadata.soundFile`, then to "".
 * @param {string} [options.recognizer="phonetic"] - Recognizer name recorded
 *   in the timeline's `source`.
 * @returns {{timeline: object, warnings: string[]}} The timeline
 *   (`version`, `source`, `duration`, `cues`) and one warning per cue whose
 *   shape code was unknown.
 * @throws {Error} When `mouthCues` is missing or not an array, or when a
 *   cue's `start`/`end` is not a finite number (or numeric string).
 */
export const normalizeRhubarbJson = (
  rhubarbJson,
  {audio, recognizer = "phonetic"} = {}
) => {
  if (!rhubarbJson || !Array.isArray(rhubarbJson.mouthCues)) {
    throw new Error("Rhubarb JSON must contain a mouthCues array.");
  }

  const warnings = [];
  const cues = rhubarbJson.mouthCues.map((cue, index) => {
    const start = parseRhubarbSeconds(cue?.start);
    const end = parseRhubarbSeconds(cue?.end);
    assertFiniteNumber(start, "start", index);
    assertFiniteNumber(end, "end", index);

    // Non-string values are stringified so the warning can still name them.
    const source =
      typeof cue?.value === "string" ? cue.value : String(cue?.value);

    return {
      start,
      end,
      mouth: mapRhubarbMouthToJapanese(source, warnings),
      source,
    };
  });

  // Prefer the duration Rhubarb reported; a missing or unusable value
  // (including null) falls back to the end of the last cue, or 0 if none.
  const metadataDuration = parseRhubarbSeconds(rhubarbJson.metadata?.duration);
  const duration = Number.isFinite(metadataDuration)
    ? metadataDuration
    : (cues.at(-1)?.end ?? 0);

  return {
    timeline: {
      version: 1,
      source: {
        audio: audio ?? rhubarbJson.metadata?.soundFile ?? "",
        engine: "rhubarb-lip-sync",
        recognizer,
      },
      duration,
      cues,
    },
    warnings,
  };
};

/**
 * Sanitise a raw length value.
 *
 * @param {unknown} value - Candidate duration.
 * @returns {number} `value` itself when it is a finite number greater than
 *   zero; otherwise 0 (covers non-numbers, NaN, ±Infinity, zero, negatives).
 */
const positiveDuration = (value) => {
  // Number.isFinite never coerces, so it also rejects every non-number.
  if (!Number.isFinite(value) || value <= 0) {
    return 0;
  }

  return value;
};

/**
 * Split one VOICEVOX mora into up to two timed segments: consonant, then vowel.
 *
 * A segment is emitted only when its length is a finite number greater than
 * zero (per positiveDuration).
 *
 * @param {object} [mora] - Mora with optional `consonant`, `consonantLength`,
 *   `vowel` and `vowelLength`; a nullish mora yields no segments.
 * @returns {Array<{duration: number, mouth: string, source: string}>}
 *   Segments in playback order.
 */
const voicevoxMoraToParts = (mora) => {
  const parts = [];

  const consonantLength = positiveDuration(mora?.consonantLength);
  if (consonantLength > 0) {
    const consonant = mora?.consonant;
    parts.push({
      duration: consonantLength,
      // Only the consonants in CLOSED_CONSONANTS close the mouth.
      mouth: CLOSED_CONSONANTS.has(consonant) ? "closed" : "rest",
      source: consonant ? `consonant:${consonant}` : "consonant",
    });
  }

  const vowelLength = positiveDuration(mora?.vowelLength);
  if (vowelLength > 0) {
    const vowel = mora?.vowel;
    // Own-key check: a bare `table[vowel]` lookup would also return members
    // inherited from Object.prototype (e.g. "constructor"), which are not
    // nullish and so would slip past a `?? "rest"` fallback.
    const isKnownVowel = Object.hasOwn(VOICEVOX_VOWEL_TO_JA_MOUTH, vowel);
    parts.push({
      duration: vowelLength,
      mouth: isKnownVowel ? VOICEVOX_VOWEL_TO_JA_MOUTH[vowel] : "rest",
      source: vowel ? `vowel:${vowel}` : "vowel",
    });
  }

  return parts;
};

/**
 * Append `part` to `parts` unless its duration is zero or negative.
 *
 * The check is `duration <= 0`, so a part whose duration is NaN or undefined
 * is still appended.
 *
 * @param {Array<object>} parts - Destination list; mutated in place.
 * @param {{duration: number}} part - Candidate segment.
 * @returns {void}
 */
const appendCuePart = (parts, part) => {
  const isEmpty = part.duration <= 0;
  if (!isEmpty) {
    parts.push(part);
  }
};

/**
 * Flatten VOICEVOX timing data into an ordered list of timed segments.
 *
 * Order: the leading `prePhonemeLength` silence, then for each accent phrase
 * its moras (via voicevoxMoraToParts) followed by its `pauseMora` if present,
 * and finally the trailing `postPhonemeLength` silence. Segments with a
 * non-positive length are left out.
 *
 * @param {object} [voicevoxTiming] - Object with optional `prePhonemeLength`,
 *   `accentPhrases` and `postPhonemeLength`.
 * @returns {Array<{duration: number, mouth: string, source: string}>}
 */
const voicevoxTimingToParts = (voicevoxTiming) => {
  const collected = [];

  // All silences share the "rest" mouth and differ only in length and label.
  const addSilence = (rawLength, label) => {
    appendCuePart(collected, {
      duration: positiveDuration(rawLength),
      mouth: "rest",
      source: label,
    });
  };

  addSilence(voicevoxTiming?.prePhonemeLength, "prePhoneme");

  const accentPhrases = voicevoxTiming?.accentPhrases ?? [];
  for (const accentPhrase of accentPhrases) {
    const moras = accentPhrase?.moras ?? [];
    for (const mora of moras) {
      for (const segment of voicevoxMoraToParts(mora)) {
        appendCuePart(collected, segment);
      }
    }

    if (accentPhrase?.pauseMora) {
      addSilence(accentPhrase.pauseMora.vowelLength, "pause");
    }
  }

  addSilence(voicevoxTiming?.postPhonemeLength, "postPhoneme");

  return collected;
};

/**
 * Add `cue` to `cues`, folding it into the last cue when both share a mouth.
 *
 * On a merge the last cue is mutated in place: its `end` becomes `cue.end`,
 * and `cue.source` is appended to its `source` with a "+" separator unless
 * the two source strings are already equal. Otherwise `cue` is pushed as-is.
 *
 * @param {Array<object>} cues - Timeline cues built so far; mutated.
 * @param {{start: number, end: number, mouth: string, source: string}} cue
 * @returns {void}
 */
const mergeCue = (cues, cue) => {
  const last = cues.at(-1);
  const canExtend = Boolean(last) && last.mouth === cue.mouth;

  if (!canExtend) {
    cues.push(cue);
    return;
  }

  last.end = cue.end;
  if (last.source !== cue.source) {
    last.source = `${last.source}+${cue.source}`;
  }
};

/**
 * Convert VOICEVOX timing data into this project's timeline format.
 *
 * Segment lengths come from voicevoxTimingToParts. When a usable
 * `durationSeconds` is given, all segments are scaled proportionally so that
 * together they span exactly that duration. Adjacent segments with the same
 * mouth are merged by mergeCue.
 *
 * @param {object} voicevoxTiming - Must contain an `accentPhrases` array; may
 *   also carry `prePhonemeLength` / `postPhonemeLength`.
 * @param {object} [options]
 * @param {string} [options.audio=""] - Audio reference stored in the timeline.
 * @param {number} [options.durationSeconds] - Target duration. Ignored unless
 *   it is a finite number greater than zero; the summed segment lengths are
 *   used instead.
 * @returns {{version: number, source: {audio: string, engine: string},
 *   duration: number, cues: Array<object>}} The normalized timeline.
 * @throws {Error} When `accentPhrases` is missing or not an array.
 */
export const normalizeVoicevoxTiming = (
  voicevoxTiming,
  {audio = "", durationSeconds} = {}
) => {
  if (!voicevoxTiming || !Array.isArray(voicevoxTiming.accentPhrases)) {
    throw new Error("VOICEVOX timing must contain an accentPhrases array.");
  }

  const parts = voicevoxTimingToParts(voicevoxTiming);
  const computedDuration = parts.reduce((sum, part) => sum + part.duration, 0);

  // A zero or negative override cannot be honoured by scaling. Accepting it
  // would report a duration that disagrees with the cue positions, so it is
  // treated the same as "not provided".
  const hasUsableOverride =
    typeof durationSeconds === "number" &&
    Number.isFinite(durationSeconds) &&
    durationSeconds > 0;
  const targetDuration = hasUsableOverride ? durationSeconds : computedDuration;
  const scale = computedDuration > 0 ? targetDuration / computedDuration : 1;

  const cues = [];
  let cursor = 0;

  for (const part of parts) {
    const duration = part.duration * scale;
    if (duration <= 0) {
      continue;
    }

    const start = cursor;
    const end = cursor + duration;
    mergeCue(cues, {
      start,
      end,
      mouth: part.mouth,
      source: part.source,
    });
    cursor = end;
  }

  // Accumulated floating-point error can leave the final end a hair off the
  // reported duration; pin it so the cues cover [0, duration] exactly.
  const lastCue = cues.at(-1);
  if (lastCue && targetDuration > lastCue.start) {
    lastCue.end = targetDuration;
  }

  return {
    version: 1,
    source: {
      audio,
      engine: "voicevox-query",
    },
    duration: targetDuration,
    cues,
  };
};