// Source: remotion_docker_devcontainer / voicevox-remotion-template / scripts / voicevox-generate.js
import fs from "node:fs/promises";
import ts from "typescript";

// Base URL of the VOICEVOX HTTP API. The VOICEVOX_URL environment variable
// wins when it is set; otherwise the built-in default below is used.
const DEFAULT_VOICEVOX_URL = "http://host.docker.internal:50021";
const VOICEVOX_URL = process.env.VOICEVOX_URL ?? DEFAULT_VOICEVOX_URL;

// Resolves a path relative to this script file into a URL object.
const resolveFromThisFile = (relativePath) =>
  new URL(relativePath, import.meta.url);

// TypeScript source that is transpiled and imported by loadScriptModule.
const scriptPath = resolveFromThisFile("../src/data/script.ts");
// Directory that receives one WAV file per say(...) event.
const outputDir = resolveFromThisFile("../public/audio/lines/");
// JSON manifest listing every generated WAV file.
const manifestPath = resolveFromThisFile("../src/data/voicevox-manifest.json");

/**
 * Computes the playback duration of a RIFF/WAVE file held in memory.
 *
 * Walks the RIFF chunk list, reads the byte rate from the "fmt " chunk and the
 * payload size from the "data" chunk, and divides one by the other.
 *
 * The declared data size is clamped to the bytes that are actually present in
 * the buffer, so a truncated file or a placeholder size in the header does not
 * yield a duration longer than the audio that exists.
 *
 * @param {Buffer} buffer - Complete contents of the WAV file.
 * @returns {number} Duration in seconds (data bytes / bytes per second).
 * @throws {Error} If the RIFF/WAVE signature is missing, the "fmt " chunk is
 *   too short to contain a byte rate, or no usable byte rate / data size is
 *   found.
 */
const getWavDurationSeconds = (buffer) => {
  if (buffer.toString("ascii", 0, 4) !== "RIFF") {
    throw new Error("Invalid WAV header: RIFF not found.");
  }
  if (buffer.toString("ascii", 8, 12) !== "WAVE") {
    throw new Error("Invalid WAV header: WAVE not found.");
  }

  // Every chunk is: 4-byte ASCII id, 4-byte little-endian size, then the body.
  const CHUNK_HEADER_SIZE = 8;
  // The byte rate sits 8 bytes into the "fmt " body and is 4 bytes wide.
  const BYTE_RATE_OFFSET = 8;
  const MIN_FMT_BODY_SIZE = BYTE_RATE_OFFSET + 4;

  let offset = 12; // First chunk starts right after "RIFF" + size + "WAVE".
  let byteRate = 0;
  let dataSize = 0;

  while (offset + CHUNK_HEADER_SIZE <= buffer.length) {
    const chunkId = buffer.toString("ascii", offset, offset + 4);
    const chunkSize = buffer.readUInt32LE(offset + 4);
    const bodyStart = offset + CHUNK_HEADER_SIZE;

    if (chunkId === "fmt ") {
      if (
        chunkSize < MIN_FMT_BODY_SIZE ||
        bodyStart + MIN_FMT_BODY_SIZE > buffer.length
      ) {
        throw new Error("Invalid WAV header: fmt chunk is truncated.");
      }
      byteRate = buffer.readUInt32LE(bodyStart + BYTE_RATE_OFFSET);
    } else if (chunkId === "data") {
      // Never count more audio than the buffer really holds.
      dataSize = Math.min(chunkSize, buffer.length - bodyStart);
      break;
    }

    // Chunk bodies are padded to an even number of bytes.
    offset = bodyStart + chunkSize + (chunkSize % 2);
  }

  if (!byteRate || !dataSize) {
    throw new Error("Failed to read WAV duration.");
  }

  return dataSize / byteRate;
};

/**
 * Reads the TypeScript file at `scriptPath`, transpiles it to an ES2022 module
 * in memory, and imports the result through a base64 `data:` URL.
 *
 * @returns {Promise<object>} The module namespace of the transpiled script.
 * @throws {Error} If the transpiler reports any error-category diagnostic;
 *   the message lists every flattened diagnostic text, one per line.
 */
const loadScriptModule = async () => {
  const tsSource = await fs.readFile(scriptPath, "utf8");

  const {outputText, diagnostics} = ts.transpileModule(tsSource, {
    fileName: scriptPath.pathname,
    compilerOptions: {
      target: ts.ScriptTarget.ES2022,
      module: ts.ModuleKind.ES2022,
    },
  });

  // Only error-category diagnostics abort the load.
  const failures = (diagnostics ?? []).filter(
    ({category}) => category === ts.DiagnosticCategory.Error
  );
  if (failures.length > 0) {
    const details = failures
      .map(({messageText}) =>
        ts.flattenDiagnosticMessageText(messageText, "\n")
      )
      .join("\n");
    throw new Error(`Failed to transpile src/data/script.ts:\n${details}`);
  }

  // Import the generated JavaScript directly from memory.
  const encodedModule = Buffer.from(outputText).toString("base64");
  return import(`data:text/javascript;base64,${encodedModule}`);
};

/**
 * Requests `${VOICEVOX_URL}/speakers` and returns the parsed JSON body.
 *
 * @returns {Promise<any>} The decoded JSON response.
 * @throws {Error} If the response status is not OK; the message carries the
 *   HTTP status code.
 */
const fetchSpeakers = async () => {
  const speakersResponse = await fetch(`${VOICEVOX_URL}/speakers`);

  if (speakersResponse.ok) {
    return speakersResponse.json();
  }

  throw new Error(`speakers failed: ${speakersResponse.status}`);
};

/**
 * Picks the speaker name and style name for one speech entry.
 *
 * Values on `speech.voicevox` take precedence; anything missing there falls
 * back to the `voicevox` settings of the character the speech refers to.
 *
 * @param {object} characters - Lookup of character definitions keyed by the
 *   value used in `speech.character`.
 * @param {object} speech - Speech entry with `id`, `character` and an optional
 *   `voicevox` override object.
 * @returns {{speakerName: (string|undefined), styleName: (string|undefined)}}
 * @throws {Error} If `speech.character` has no entry in `characters`.
 */
const resolveVoice = (characters, speech) => {
  const characterEntry = characters[speech.character];
  if (!characterEntry) {
    throw new Error(`Unknown character "${speech.character}" in ${speech.id}.`);
  }

  // Per-line override first, character default second.
  const pickSetting = (key) =>
    speech.voicevox?.[key] ?? characterEntry.voicevox?.[key];

  const speakerName = pickSetting("speakerName");
  const styleName = pickSetting("styleName");
  return {speakerName, styleName};
};

/**
 * Looks up the numeric style id for a speaker name / style name pair.
 *
 * @param {Array<{name: string, styles: Array<{name: string, id: number}>}>} speakers
 *   Speaker list to search.
 * @param {{speakerName: string, styleName: string}} voice - Names to match
 *   exactly against `speakers[].name` and `speakers[].styles[].name`.
 * @param {string} speechId - Id of the speech entry; used in error messages.
 * @returns {number} The `id` of the matching style.
 * @throws {Error} If the speaker or the style is not found; the message lists
 *   the names that are available.
 */
const resolveSpeakerId = (speakers, voice, speechId) => {
  const listNames = (entries) => entries.map((entry) => entry.name).join(", ");

  const matchedSpeaker = speakers.find(
    (candidate) => candidate.name === voice.speakerName
  );
  if (!matchedSpeaker) {
    throw new Error(
      `Speaker "${voice.speakerName}" for ${speechId} was not found. Available speakers: ${listNames(speakers)}`
    );
  }

  const matchedStyle = matchedSpeaker.styles.find(
    (candidate) => candidate.name === voice.styleName
  );
  if (!matchedStyle) {
    throw new Error(
      `Style "${voice.styleName}" for ${speechId} was not found on "${voice.speakerName}". Available styles: ${listNames(matchedSpeaker.styles)}`
    );
  }

  return matchedStyle.id;
};

// Main script.
//
// 1. Load `characters` and `timeline` from src/data/script.ts.
// 2. Validate every say(...) event and resolve its voice BEFORE synthesizing
//    anything, so a mistake in a late line cannot leave half-updated output.
// 3. For each event, request an audio query, synthesize it, and write the WAV
//    to public/audio/lines/<id>.wav.
// 4. Write a manifest describing every generated file.

// Tuning values written onto every audio query before synthesis.
const QUERY_OVERRIDES = {
  speedScale: 1.02,
  pitchScale: 0.0,
  intonationScale: 1.1,
};

// The id is spliced into a URL to build the output path. These characters
// would be parsed as a path separator, query, or fragment and send the file
// somewhere other than "<id>.wav" inside the output directory.
const UNSAFE_ID_PATTERN = /[\\/?#]/;

const {characters, timeline} = await loadScriptModule();
if (!characters || !timeline) {
  throw new Error("src/data/script.ts must export characters and timeline.");
}

const speechEvents = timeline.filter((event) => event?.type === "say");
if (speechEvents.length === 0) {
  throw new Error("src/data/script.ts has no say(...) events.");
}

// Structural validation: no network or disk access needed.
const seenIds = new Set();
for (const speech of speechEvents) {
  if (!speech.id || !speech.text || !speech.character) {
    throw new Error("Each say(...) entry needs id, character, and text.");
  }

  const id = String(speech.id);
  if (UNSAFE_ID_PATTERN.test(id)) {
    throw new Error(
      `say(...) id "${id}" must not contain "/", "\\", "?" or "#" because it is used as a file name.`
    );
  }
  // Two lines sharing an id would overwrite the same WAV file.
  if (seenIds.has(id)) {
    throw new Error(
      `Duplicate say(...) id "${id}": each line needs a unique id because it names the output file.`
    );
  }
  seenIds.add(id);
}

const speakers = await fetchSpeakers();

// Resolve every voice up front so an unknown character, speaker, or style is
// reported before any audio is synthesized.
const jobs = speechEvents.map((speech) => {
  const voice = resolveVoice(characters, speech);
  const speakerId = resolveSpeakerId(speakers, voice, speech.id);
  return {speech, voice, speakerId};
});

await fs.mkdir(outputDir, {recursive: true});
const manifest = [];

// Requests are issued one at a time, in timeline order.
for (const {speech, voice, speakerId} of jobs) {
  const queryResponse = await fetch(
    `${VOICEVOX_URL}/audio_query?text=${encodeURIComponent(speech.text)}&speaker=${speakerId}`,
    {method: "POST"}
  );
  if (!queryResponse.ok) {
    throw new Error(
      `audio_query failed for ${speech.id}: ${queryResponse.status}`
    );
  }

  const query = Object.assign(await queryResponse.json(), QUERY_OVERRIDES);

  const synthResponse = await fetch(
    `${VOICEVOX_URL}/synthesis?speaker=${speakerId}`,
    {
      method: "POST",
      headers: {"Content-Type": "application/json"},
      body: JSON.stringify(query),
    }
  );
  if (!synthResponse.ok) {
    throw new Error(
      `synthesis failed for ${speech.id}: ${synthResponse.status}`
    );
  }

  const audioBuffer = Buffer.from(await synthResponse.arrayBuffer());
  // Parse the WAV first so an invalid response is never written to disk.
  const durationSeconds = getWavDurationSeconds(audioBuffer);
  const outputPath = new URL(`./${speech.id}.wav`, outputDir);
  await fs.writeFile(outputPath, audioBuffer);

  manifest.push({
    id: speech.id,
    character: speech.character,
    speakerName: voice.speakerName,
    styleName: voice.styleName,
    speakerId,
    file: `audio/lines/${speech.id}.wav`,
    durationSeconds,
  });
  console.log(
    `Wrote ${outputPath.pathname} (${voice.speakerName} / ${voice.styleName}, ${durationSeconds.toFixed(2)}s)`
  );
}

// The manifest is only written once every line has been generated.
await fs.writeFile(manifestPath, JSON.stringify(manifest, null, 2) + "\n");
console.log(`Updated ${manifestPath.pathname}`);