// remotion_docker_devcontainer/voicevox-remotion-template/scripts/lipsync-utils.test.js
import assert from "node:assert/strict";
import {test} from "node:test";
import {normalizeRhubarbJson, normalizeVoicevoxTiming} from "./lipsync-utils.js";

// Feeds the Rhubarb cue values "X", "D" and "F" through the normalizer and
// checks that the resulting cues carry the mouths "rest", "a" and "u", in order.
test("maps Rhubarb mouth shapes to Japanese mouth shapes", () => {
  const rhubarbOutput = {
    metadata: {duration: 1.2},
    mouthCues: [
      {start: 0, end: 0.1, value: "X"},
      {start: 0.1, end: 0.2, value: "D"},
      {start: 0.2, end: 0.3, value: "F"},
    ],
  };

  const result = normalizeRhubarbJson(rhubarbOutput, {audio: "audio/example.wav"});
  const mouths = result.timeline.cues.map(({mouth}) => mouth);

  assert.deepStrictEqual(mouths, ["rest", "a", "u"]);
});

// The cue value "Z" is treated by this test as an unknown shape: the first cue
// must come back as "rest" and the first warning must name the offending value.
test("uses rest for unknown shapes and reports a warning", () => {
  const unknownShapeInput = {
    mouthCues: [{start: 0, end: 0.1, value: "Z"}],
  };

  const result = normalizeRhubarbJson(unknownShapeInput, {audio: "audio/example.wav"});

  const firstCue = result.timeline.cues[0];
  assert.strictEqual(firstCue.mouth, "rest");
  assert.match(result.warnings[0], /Unknown Rhubarb mouth shape "Z"/);
});

// metadata.duration (2.5) differs from the only cue's end (0.1), so the
// assertion shows the timeline duration is taken from the metadata value.
test("uses metadata duration when available", () => {
  const withMetadata = {
    metadata: {duration: 2.5},
    mouthCues: [{start: 0, end: 0.1, value: "X"}],
  };

  const result = normalizeRhubarbJson(withMetadata);

  assert.strictEqual(result.timeline.duration, 2.5);
});

// No metadata is supplied here; the expected duration (0.8) equals the `end`
// of the final cue in the input.
test("falls back to the last cue end for duration", () => {
  const withoutMetadata = {
    mouthCues: [
      {start: 0, end: 0.4, value: "X"},
      {start: 0.4, end: 0.8, value: "D"},
    ],
  };

  const result = normalizeRhubarbJson(withoutMetadata);

  assert.strictEqual(result.timeline.duration, 0.8);
});

// Sends one mora per vowel (a, i, u, e, o, N), each 0.2 long, through the
// VOICEVOX normalizer. Expected mouths: the five vowels unchanged, "N" as
// "closed", with a "rest" cue on either side.
// NOTE(review): the surrounding "rest" cues presumably come from
// prePhonemeLength / postPhonemeLength — confirm against lipsync-utils.js.
test("builds mouth cues from VOICEVOX vowels", () => {
  const vowels = ["a", "i", "u", "e", "o", "N"];
  const query = {
    prePhonemeLength: 0.1,
    postPhonemeLength: 0.1,
    accentPhrases: [
      {
        moras: vowels.map((vowel) => ({vowel, vowelLength: 0.2})),
      },
    ],
  };

  const timeline = normalizeVoicevoxTiming(query, {audio: "audio/example.wav"});
  const mouths = timeline.cues.map(({mouth}) => mouth);

  assert.deepStrictEqual(mouths, ["rest", "a", "i", "u", "e", "o", "closed", "rest"]);
  assert.strictEqual(timeline.source.engine, "voicevox-query");
});

// Each mora pairs a consonant with a vowel, both 0.1 long. The expectation is
// that "p" and "m" yield a "closed" cue before their vowel, while "k" yields
// "rest".
test("maps selected VOICEVOX consonants to closed and others to rest", () => {
  const consonantVowelPairs = [
    ["p", "a"],
    ["m", "i"],
    ["k", "u"],
  ];
  const query = {
    accentPhrases: [
      {
        moras: consonantVowelPairs.map(([consonant, vowel]) => ({
          consonant,
          consonantLength: 0.1,
          vowel,
          vowelLength: 0.1,
        })),
      },
    ],
  };

  const timeline = normalizeVoicevoxTiming(query);
  const mouths = timeline.cues.map(({mouth}) => mouth);

  assert.deepStrictEqual(mouths, ["closed", "a", "closed", "i", "rest", "u"]);
});

// The lengths in the query add up to 0.5 (pre 0.1 + consonant 0.1 + vowel 0.1
// + pause 0.1 + post 0.1) while durationSeconds requests 1. The expected
// result is a single "rest" cue running from 0 to 1 and a timeline duration
// of 1.
test("scales VOICEVOX timing duration and merges adjacent mouths", () => {
  const query = {
    prePhonemeLength: 0.1,
    postPhonemeLength: 0.1,
    accentPhrases: [
      {
        moras: [
          {consonant: "k", consonantLength: 0.1, vowel: "x", vowelLength: 0.1},
        ],
        pauseMora: {vowel: "pau", vowelLength: 0.1},
      },
    ],
  };

  const timeline = normalizeVoicevoxTiming(query, {durationSeconds: 1});

  assert.strictEqual(timeline.duration, 1);

  const mouths = timeline.cues.map(({mouth}) => mouth);
  assert.deepStrictEqual(mouths, ["rest"]);

  const onlyCue = timeline.cues[0];
  assert.strictEqual(onlyCue.start, 0);
  assert.strictEqual(onlyCue.end, 1);
});