import 'dotenv/config';
import * as fs from 'fs';
import * as path from 'path';
import {
CustomNode,
SequentialGraphBuilder,
GraphTypes,
ProcessContext,
RemoteLLMChatNode,
RemoteTTSNode,
} from '@inworld/runtime/graph';
import { renderJinja } from '@inworld/runtime/primitives/llm';
// @ts-ignore - wav-encoder does not ship TypeScript type definitions
import * as wavEncoder from 'wav-encoder';
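// Voice configurations: each entry pairs a character with a target language and the TTS speaker ID used for synthesis.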
const voices = [
{
name: "Leroy",
language: "en-US",
speakerId: "Alex", // English voice ID for TTS
},
{
name: "Gael",
language: "es-ES",
speakerId: "Diego", // Spanish voice ID for TTS
},
{
name: "Yuki",
language: "ja-JP",
speakerId: "Asuka", // Japanese voice ID for TTS
}
];
// Jinja template for generating culturally-appropriate introductions
const introductionPrompt = `
A character named {{name}} needs to introduce themselves in {{language}}.
Generate a natural, culturally appropriate self-introduction in that language.
The introduction should:
- Be 1-2 sentences long
- Include their name
- Be friendly and conversational
- Be appropriate for the specified language and culture
Return ONLY the introduction text in the specified language, nothing else.
`;
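/**
 * Builds the LLM chat request for a single voice. Renders the Jinja prompt
 * with the character's name and language, and stores the voice data in the
 * ProcessContext datastore for downstream nodes.
 */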
class PromptBuilderNode extends CustomNode {
async process(
context: ProcessContext,
input: { name: string; language: string; speakerId: string }
): Promise<GraphTypes.LLMChatRequest> {
// Store voice data in the ProcessContext datastore so downstream nodes (e.g. the text extractor) can read it later
const datastore = context.getDatastore();
datastore.add('voiceData', {
name: input.name,
language: input.language,
speakerId: input.speakerId,
});
// Render the Jinja template with the character's name and language
const renderedPrompt = await renderJinja(introductionPrompt, JSON.stringify({
name: input.name,
language: input.language,
}));
console.log(`\n[${input.name}] Generating introduction in ${input.language}...`);
// Return LLM chat request
return new GraphTypes.LLMChatRequest({
messages: [
{
role: 'user',
content: renderedPrompt,
},
],
});
}
}
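/**
 * Extracts the plain introduction text from the LLM response content so it
 * can be passed directly to the TTS node.
 */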
class TextExtractorNode extends CustomNode {
process(
context: ProcessContext,
input: GraphTypes.Content
): string {
const datastore = context.getDatastore();
const voiceData = datastore.get('voiceData');
const introText = input.content || '';
console.log(`[${voiceData?.name}] Generated text: "${introText}"`);
// Return the text content for TTS processing
return introText;
}
}
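/**
 * Builds and runs a sequential graph for one voice
 * (prompt builder -> LLM -> text extractor -> TTS),
 * then writes the synthesized audio to a WAV file.
 */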
async function processVoice(
voice: { name: string; language: string; speakerId: string },
apiKey: string,
outputDirectory: string
): Promise<void> {
console.log(`\nStarting ${voice.name} (${voice.language}, voice: ${voice.speakerId})`);
const promptBuilderNode = new PromptBuilderNode({
id: `prompt-builder-${voice.name}`,
});
const llmNode = new RemoteLLMChatNode({
id: `llm-node-${voice.name}`,
modelName: 'gpt-4o-mini',
provider: 'openai',
textGenerationConfig: {
maxNewTokens: 200,
temperature: 1.1,
},
});
const textExtractorNode = new TextExtractorNode({
id: `text-extractor-${voice.name}`,
});
const ttsNode = new RemoteTTSNode({
id: `tts-node-${voice.name}`,
speakerId: voice.speakerId,
modelId: 'inworld-tts-1-max',
sampleRate: 48000,
temperature: 1.1,
speakingRate: 1,
});
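// Chain the nodes into a sequential pipeline: prompt builder -> LLM -> text extractor -> TTS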
const graph = new SequentialGraphBuilder({
id: `voice-graph-${voice.name}`,
apiKey,
enableRemoteConfig: false,
nodes: [
promptBuilderNode,
llmNode,
textExtractorNode,
ttsNode,
]
});
const executor = graph.build();
try {
// Start the graph execution with the voice configuration as input
const { outputStream } = await executor.start(voice);
let allAudioData: number[] = [];
let processedText = '';
// Process the output stream
for await (const result of outputStream) {
await result.processResponse({
// Handle TTS output stream
TTSOutputStream: async (ttsStream: GraphTypes.TTSOutputStream) => {
console.log(`[${voice.name}] Generating audio with voice ${voice.speakerId}...`);
// Collect text and audio chunks from the TTS stream
for await (const chunk of ttsStream) {
if (chunk.text) {
processedText += chunk.text;
}
if (chunk.audio?.data) {
allAudioData = allAudioData.concat(Array.from(chunk.audio.data));
}
}
},
});
}
// Save the collected audio to a WAV file (sample rate must match the TTS node's sampleRate)
if (allAudioData.length > 0) {
const audio = {
sampleRate: 48000,
channelData: [new Float32Array(allAudioData)],
};
const buffer = await wavEncoder.encode(audio);
const outputPath = path.join(
outputDirectory,
`${voice.name}_${voice.language}_introduction.wav`
);
fs.writeFileSync(outputPath, Buffer.from(buffer));
console.log(`[${voice.name}] ✓ Audio saved to: ${outputPath}`);
console.log(`[${voice.name}] Duration: ~${(allAudioData.length / 48000).toFixed(1)}s`);
}
} catch (error) {
console.error(`[${voice.name}] Error during processing:`, error);
throw error;
}
}
/**
 * Main function that demonstrates parallel graph execution:
 * all voice configurations are processed concurrently via Promise.all.
 */
async function main() {
const apiKey = process.env.INWORLD_API_KEY || '';
if (!apiKey) {
throw new Error('Please set INWORLD_API_KEY environment variable');
}
const OUTPUT_DIRECTORY = path.join(__dirname, 'audio_output');
// Ensure output directory exists
if (!fs.existsSync(OUTPUT_DIRECTORY)) {
fs.mkdirSync(OUTPUT_DIRECTORY, { recursive: true });
}
const startTime = Date.now();
// Process all voices in parallel; errors are caught per voice so one failure does not stop the others
const processingPromises = voices.map(voice =>
processVoice(voice, apiKey, OUTPUT_DIRECTORY).catch(error => {
console.error(`Failed to process ${voice.name}:`, error.message);
return null;
})
);
await Promise.all(processingPromises);
const duration = ((Date.now() - startTime) / 1000).toFixed(1);
console.log('\n' + '='.repeat(60));
console.log(`All voices completed in ${duration}s (parallel execution)`);
console.log(`Audio files saved in: ${OUTPUT_DIRECTORY}`);
console.log('='.repeat(60) + '\n');
}
// Run the main function
main().catch((error) => {
console.error('Error:', error);
process.exit(1);
});