// remotion_docker_devcontainer / voicevox-remotion-template / scripts / voicevox-generate.js
import fs from "node:fs/promises";
import path from "node:path";
import {fileURLToPath} from "node:url";
import ts from "typescript";

// Base URL of the VOICEVOX engine HTTP API. The VOICEVOX_URL environment
// variable wins when it is set; otherwise the Docker host alias is used.
const {VOICEVOX_URL = "http://host.docker.internal:50021"} = process.env;

// Absolute path of the directory one level above this script's own directory.
const projectRoot = path.resolve(fileURLToPath(new URL("..", import.meta.url)));
// Absolute path of the project's public/ directory.
const publicDir = path.resolve(projectRoot, "public");

/**
 * Turns a user-supplied path into an absolute one.
 *
 * @param {string} value - Absolute path, or a path relative to the project root.
 * @returns {string} `value` unchanged when it is already absolute, otherwise
 *   `value` resolved against `projectRoot`.
 */
const resolveProjectPath = (value) => {
  if (path.isAbsolute(value)) {
    return value;
  }

  return path.resolve(projectRoot, value);
};

/**
 * Parses the command line (`process.argv`) into resolved paths.
 *
 * Supported options, each followed by a separate value argument:
 *   --script    script module to load   (default "src/data/script.ts")
 *   --output    directory for WAV files (default "public/audio/lines")
 *   --manifest  manifest JSON file      (default "src/data/voicevox-manifest.json")
 *
 * @returns {{scriptPath: string, outputDir: string, manifestPath: string}}
 *   The option values passed through `resolveProjectPath`.
 * @throws {Error} On a positional argument, an unknown option, or an option
 *   without a value.
 */
const parseArgs = () => {
  const values = {
    script: "src/data/script.ts",
    output: "public/audio/lines",
    manifest: "src/data/voicevox-manifest.json",
  };
  const args = process.argv.slice(2);

  for (let index = 0; index < args.length; index += 1) {
    const arg = args[index];
    if (!arg.startsWith("--")) {
      throw new Error(`Unknown argument "${arg}".`);
    }

    const key = arg.slice(2);
    // Own-property check on purpose: the `in` operator also matches inherited
    // names, which would let "--constructor" or "--toString" pass as options.
    if (!Object.hasOwn(values, key)) {
      throw new Error(`Unknown option "${arg}".`);
    }

    const value = args[index + 1];
    if (!value || value.startsWith("--")) {
      throw new Error(`Option "${arg}" needs a value.`);
    }

    values[key] = value;
    // Skip the value that was just consumed.
    index += 1;
  }

  return {
    scriptPath: resolveProjectPath(values.script),
    outputDir: resolveProjectPath(values.output),
    manifestPath: resolveProjectPath(values.manifest),
  };
};

// Parse the command line once at module load; the resulting absolute paths are
// used by the helpers and the main flow below.
const {scriptPath, outputDir, manifestPath} = parseArgs();

/**
 * Expresses a path relative to the project root, always with forward slashes.
 *
 * @param {string} targetPath - Path to convert.
 * @returns {string} `targetPath` relative to `projectRoot`, "/"-separated.
 */
const toProjectRelative = (targetPath) => {
  const relativeToRoot = path.relative(projectRoot, targetPath);
  const segments = relativeToRoot.split(path.sep);
  return segments.join("/");
};

/**
 * Expresses a path relative to the public/ directory, with forward slashes.
 *
 * @param {string} targetPath - Path that must lie inside `publicDir`.
 * @returns {string} "/"-separated path relative to `publicDir`; an empty
 *   string when `targetPath` is `publicDir` itself.
 * @throws {Error} When `targetPath` is outside `publicDir`.
 */
const toPublicRelative = (targetPath) => {
  const relativePath = path.relative(publicDir, targetPath);
  // Only a whole ".." first segment means "outside". A plain startsWith("..")
  // would also reject directories inside public/ whose name merely begins
  // with two dots (for example "..cache").
  const escapesPublicDir =
    relativePath === ".." ||
    relativePath.startsWith(`..${path.sep}`) ||
    // path.relative can return an absolute path (e.g. another Windows drive).
    path.isAbsolute(relativePath);
  if (escapesPublicDir) {
    throw new Error(
      `Output directory must be inside public/: ${toProjectRelative(targetPath)}`
    );
  }

  return relativePath.split(path.sep).join("/");
};

// Output directory relative to public/ ("/"-separated). Computed at module
// load, so an output directory outside public/ throws before the script module
// is loaded or any request is made.
const publicRelativeOutputDir = toPublicRelative(outputDir);

/**
 * Computes the playback length of a WAV file held in memory.
 *
 * Walks the RIFF chunk list that starts at byte 12, takes the byte rate from
 * the "fmt " chunk and the payload size from the first "data" chunk, and
 * divides one by the other.
 *
 * @param {Buffer} buffer - Complete WAV file contents.
 * @returns {number} Duration in seconds (data size / byte rate).
 * @throws {Error} When the RIFF/WAVE signature is missing, or when no non-zero
 *   byte rate and data size could be read.
 */
const getWavDurationSeconds = (buffer) => {
  const readTag = (start) => buffer.toString("ascii", start, start + 4);

  if (readTag(0) !== "RIFF") {
    throw new Error("Invalid WAV header: RIFF not found.");
  }
  if (readTag(8) !== "WAVE") {
    throw new Error("Invalid WAV header: WAVE not found.");
  }

  let bytesPerSecond = 0;
  let audioBytes = 0;
  let cursor = 12;

  // Each chunk is a 4-byte tag, a 4-byte little-endian size, then the payload.
  while (cursor + 8 <= buffer.length) {
    const tag = readTag(cursor);
    const size = buffer.readUInt32LE(cursor + 4);

    if (tag === "data") {
      audioBytes = size;
      break;
    }
    if (tag === "fmt ") {
      // The byte-rate field sits 8 bytes into the fmt payload.
      bytesPerSecond = buffer.readUInt32LE(cursor + 16);
    }

    // Odd-sized chunks are followed by one padding byte.
    cursor += 8 + size + (size % 2);
  }

  if (!(bytesPerSecond && audioBytes)) {
    throw new Error("Failed to read WAV duration.");
  }

  return audioBytes / bytesPerSecond;
};

/**
 * Loads the TypeScript script file as an ES module.
 *
 * The file at `scriptPath` is read, transpiled in memory with
 * `ts.transpileModule`, and imported through a base64 `data:` URL, so no
 * compiled file is written to disk.
 *
 * @returns {Promise<object>} The module namespace of the transpiled script.
 * @throws {Error} When the transpiler reports error-category diagnostics.
 */
const loadScriptModule = async () => {
  const tsSource = await fs.readFile(scriptPath, "utf8");
  const {outputText, diagnostics} = ts.transpileModule(tsSource, {
    fileName: scriptPath,
    compilerOptions: {
      module: ts.ModuleKind.ES2022,
      target: ts.ScriptTarget.ES2022,
    },
  });

  // Keep only real errors; other diagnostic categories are ignored.
  const errorMessages = (diagnostics ?? [])
    .filter(({category}) => category === ts.DiagnosticCategory.Error)
    .map(({messageText}) => ts.flattenDiagnosticMessageText(messageText, "\n"));

  if (errorMessages.length > 0) {
    throw new Error(
      `Failed to transpile ${toProjectRelative(scriptPath)}:\n${errorMessages.join("\n")}`
    );
  }

  const encodedModule = Buffer.from(outputText).toString("base64");
  return import(`data:text/javascript;base64,${encodedModule}`);
};

/**
 * Requests the speaker list from the VOICEVOX engine (`/speakers`).
 *
 * @returns {Promise<any>} The parsed JSON body of the response.
 * @throws {Error} When the response status is not OK.
 */
const fetchSpeakers = async () => {
  const speakersResponse = await fetch(`${VOICEVOX_URL}/speakers`);
  if (speakersResponse.ok) {
    return speakersResponse.json();
  }

  throw new Error(`speakers failed: ${speakersResponse.status}`);
};

/**
 * Determines which speaker and style a speech entry should use.
 *
 * Settings on the entry itself (`speech.voicevox`) take precedence over the
 * settings of its character (`characters[speech.character].voicevox`).
 *
 * @param {object} characters - Character definitions keyed by character name.
 * @param {object} speech - Speech entry with `id`, `character` and an
 *   optional `voicevox` override.
 * @returns {{speakerName: (string|undefined), styleName: (string|undefined)}}
 * @throws {Error} When `speech.character` is not a key of `characters`.
 */
const resolveVoice = (characters, speech) => {
  const characterEntry = characters[speech.character];
  if (!characterEntry) {
    throw new Error(`Unknown character "${speech.character}" in ${speech.id}.`);
  }

  // Per-line override first, then the character-level default.
  const pickSetting = (key) =>
    speech.voicevox?.[key] ?? characterEntry.voicevox?.[key];

  return {
    speakerName: pickSetting("speakerName"),
    styleName: pickSetting("styleName"),
  };
};

/**
 * Looks up the numeric style id for a speaker/style name pair.
 *
 * @param {Array<{name: string, styles: Array<{name: string, id: number}>}>} speakers
 *   Speaker list to search.
 * @param {{speakerName: string, styleName: string}} voice - Names to look up.
 * @param {string} speechId - Id of the speech entry, used in error messages.
 * @returns {number} The `id` of the matching style.
 * @throws {Error} When the speaker or the style is not found; the message
 *   lists the names that are available.
 */
const resolveSpeakerId = (speakers, voice, speechId) => {
  const listNames = (entries) => entries.map((entry) => entry.name).join(", ");

  const matchedSpeaker = speakers.find(
    (candidate) => candidate.name === voice.speakerName
  );
  if (!matchedSpeaker) {
    throw new Error(
      `Speaker "${voice.speakerName}" for ${speechId} was not found. Available speakers: ${listNames(speakers)}`
    );
  }

  const matchedStyle = matchedSpeaker.styles.find(
    (candidate) => candidate.name === voice.styleName
  );
  if (!matchedStyle) {
    throw new Error(
      `Style "${voice.styleName}" for ${speechId} was not found on "${voice.speakerName}". Available styles: ${listNames(matchedSpeaker.styles)}`
    );
  }

  return matchedStyle.id;
};

// ---------------------------------------------------------------------------
// Main flow: load the script module, synthesize one WAV per say(...) event
// through the VOICEVOX HTTP API, then write a manifest of the generated files.
// ---------------------------------------------------------------------------
const {characters, timeline} = await loadScriptModule();
if (!characters || !timeline) {
  throw new Error(
    `${toProjectRelative(scriptPath)} must export characters and timeline.`
  );
}

const speechEvents = timeline.filter((event) => event?.type === "say");
if (speechEvents.length === 0) {
  throw new Error(`${toProjectRelative(scriptPath)} has no say(...) events.`);
}

// Validate every entry before contacting the engine, so a malformed script
// fails fast instead of after some lines were already synthesized and written.
const seenSpeechIds = new Set();
for (const speech of speechEvents) {
  if (!speech.id || !speech.text || !speech.character) {
    throw new Error("Each say(...) entry needs id, character, and text.");
  }

  // Every id becomes "<id>.wav"; a repeated id would silently overwrite the
  // earlier file and leave two manifest entries pointing at the same audio.
  const idKey = String(speech.id);
  if (seenSpeechIds.has(idKey)) {
    throw new Error(`Duplicate say(...) id "${idKey}".`);
  }
  seenSpeechIds.add(idKey);
}

const speakers = await fetchSpeakers();

await fs.mkdir(outputDir, {recursive: true});
const manifest = [];

// Lines are synthesized one after another; each iteration performs an
// audio_query request followed by a synthesis request.
for (const speech of speechEvents) {
  const voice = resolveVoice(characters, speech);
  const speakerId = resolveSpeakerId(speakers, voice, speech.id);
  const queryResponse = await fetch(
    `${VOICEVOX_URL}/audio_query?text=${encodeURIComponent(speech.text)}&speaker=${speakerId}`,
    {method: "POST"}
  );
  if (!queryResponse.ok) {
    throw new Error(`audio_query failed: ${queryResponse.status}`);
  }

  // The same fixed overrides are applied to every line's query before it is
  // sent back for synthesis.
  const query = await queryResponse.json();
  query.speedScale = 1.02;
  query.pitchScale = 0.0;
  query.intonationScale = 1.1;

  const synthResponse = await fetch(
    `${VOICEVOX_URL}/synthesis?speaker=${speakerId}`,
    {
      method: "POST",
      headers: {"Content-Type": "application/json"},
      body: JSON.stringify(query),
    }
  );
  if (!synthResponse.ok) {
    throw new Error(`synthesis failed: ${synthResponse.status}`);
  }

  const audioBuffer = Buffer.from(await synthResponse.arrayBuffer());
  const fileName = `${speech.id}.wav`;
  const outputPath = path.join(outputDir, fileName);
  await fs.writeFile(outputPath, audioBuffer);
  const durationSeconds = getWavDurationSeconds(audioBuffer);
  manifest.push({
    id: speech.id,
    character: speech.character,
    speakerName: voice.speakerName,
    styleName: voice.styleName,
    speakerId,
    // posix.join avoids a leading "/" when the output directory is public/
    // itself (publicRelativeOutputDir === "").
    file: path.posix.join(publicRelativeOutputDir, fileName),
    durationSeconds,
  });
  console.log(
    `Wrote ${outputPath} (${voice.speakerName} / ${voice.styleName}, ${durationSeconds.toFixed(2)}s)`
  );
}

// The manifest is written only after every line succeeded.
await fs.mkdir(path.dirname(manifestPath), {recursive: true});
await fs.writeFile(manifestPath, JSON.stringify(manifest, null, 2) + "\n");
console.log(`Updated ${manifestPath}`);