Connect via WebSocket. For browser-native, low-latency voice, see WebRTC.
## Endpoint

```
wss://api.inworld.ai/api/v1/realtime/session?key=<session-id>&protocol=realtime
```
| Parameter | Required | Description |
|---|---|---|
| `key` | Yes | Session ID from your app |
| `protocol` | Yes | Must be `realtime` |
## Authentication

| Environment | Header | Notes |
|---|---|---|
| Server-side (Node.js) | `Authorization: Basic <base64-api-key>` | The API key from the Inworld Portal is already Base64-encoded |
| Client-side (browser) | `Authorization: Bearer <jwt-token>` | Mint a JWT on your backend; see the JWT sample app for a complete example |
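For server-side connections, the header value is the portal key used verbatim after the `Basic ` prefix, since it is already Base64-encoded. A minimal sketch, reading the key from an environment variable as the full example below does:

```js
// Server-side only: never expose the API key to a browser client.
const headers = {
  Authorization: `Basic ${process.env.INWORLD_API_KEY}`
};
```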
## Flow

- Connect → receive `session.created`
- Send `session.update` (instructions, audio config, tools)
- Stream audio (`input_audio_buffer.append`) or text (`conversation.item.create`)
- Send `response.create` → handle `response.output_*` events until `response.done`
## Session Config

`session.update` accepts partial updates, so you can change the prompt, voice, model, tools, and other settings mid-conversation.
```js
ws.send(JSON.stringify({
type: 'session.update',
session: {
type: 'realtime',
model: 'openai/gpt-4o-mini',
instructions: 'You are a concise concierge.',
output_modalities: ['audio', 'text'],
audio: {
input: {
turn_detection: {
type: 'semantic_vad',
eagerness: 'medium',
create_response: true,
interrupt_response: true
}
},
output: {
voice: 'Clive',
model: 'inworld-tts-1.5-mini',
speed: 1.0
}
},
tools: [{
type: 'function',
name: 'get_weather',
description: 'Fetch weather for a location',
parameters: {
type: 'object',
properties: { location: { type: 'string' } },
required: ['location']
}
}]
}
}));
```
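Because updates are partial, you can change a single field later without resending the whole session. A minimal sketch that swaps only the instructions mid-conversation (the new prompt text is illustrative):

```js
// Partial update: only the fields present are changed;
// the rest of the session configuration is left untouched.
ws.send(JSON.stringify({
  type: 'session.update',
  session: {
    type: 'realtime',
    instructions: 'You are a concise concierge. Answer in one sentence.'
  }
}));
```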
## Audio

The default audio format is PCM16 at 24 kHz, mono, base64-encoded. The API also accepts `audio/pcmu` and `audio/pcma` (G.711) at 8 kHz for telephony use cases. Set the format in `session.update`. The recommended chunk size is 60–100 ms.
```js
ws.send(JSON.stringify({ type: 'input_audio_buffer.append', audio: base64PcmChunk }));
```
Use `input_audio_buffer.clear` to discard unwanted audio.
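If your capture pipeline yields Float32 samples (as the Web Audio API does), convert them to 16-bit PCM and base64-encode before appending. A minimal sketch, assuming mono 24 kHz input; the helper name and chunk constant are illustrative, not part of the API:

```js
// Convert Float32 samples in [-1, 1] to PCM16, then base64.
function floatTo16BitPcmBase64(float32Samples) {
  const pcm = new Int16Array(float32Samples.length);
  for (let i = 0; i < float32Samples.length; i++) {
    const s = Math.max(-1, Math.min(1, float32Samples[i]));
    pcm[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
  }
  return Buffer.from(pcm.buffer).toString('base64');
}

// At 24 kHz, an 80 ms chunk (inside the recommended 60-100 ms range)
// is 24000 * 0.08 = 1920 samples.
const CHUNK_SAMPLES = 1920;
```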
## Text

The Realtime API accepts text as well as audio. Send it from your client with `conversation.item.create`.
```js
ws.send(JSON.stringify({
type: 'conversation.item.create',
item: {
type: 'message',
role: 'user',
content: [{
type: 'input_text',
text: 'Can you summarize the notes I sent?'
}]
}
}));
```
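Sending a text item does not by itself produce a reply; as in the flow above, follow it with `response.create`:

```js
// Ask the model to respond to the conversation items added so far.
ws.send(JSON.stringify({ type: 'response.create' }));
```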
## Events

Speech-to-speech conversations are driven by WebSocket events: client events that you send to the API, and server events that you receive and react to.

- Session: `session.created`, `session.updated`
- Conversation: `conversation.item.added`/`done`/`retrieved`/`deleted`/`truncated`, transcription deltas and completions
- Responses: `response.created`, `response.output_item.added`/`done`, `response.output_text.delta`/`done`, `response.output_audio.delta`/`done`, `response.done`
- Audio/VAD: `input_audio_buffer.speech_started`, `input_audio_buffer.speech_stopped`, `response.output_audio_transcript.delta`
- Errors: `error`
The full list of events and their schemas is available in the API reference.
## Node.js WebSocket server example

A server-side Node.js example using the `ws` library with Basic auth.
```js
import WebSocket from 'ws';
const sessionId = 'your-session-id';
const credentials = process.env.INWORLD_API_KEY;
const ws = new WebSocket(`wss://api.inworld.ai/api/v1/realtime/session?key=${sessionId}&protocol=realtime`, {
headers: {
Authorization: `Basic ${credentials}`
}
});
ws.on('open', () => {
console.log('WebSocket connected');
});
ws.on('message', (buffer) => {
const message = JSON.parse(buffer.toString());
switch (message.type) {
case 'session.created':
console.log('Session created:', message.session.id);
updateSession();
break;
case 'session.updated':
console.log('Session updated');
sendMessage('Hello!');
break;
case 'conversation.item.added':
console.log('Conversation item added:', message.item.id);
break;
case 'conversation.item.done':
console.log('Conversation item done');
createResponse();
break;
case 'input_audio_buffer.speech_started':
console.log('Speech started at', message.audio_start_ms, 'ms');
break;
case 'input_audio_buffer.speech_stopped':
console.log('Speech stopped at', message.audio_end_ms, 'ms');
break;
case 'conversation.item.input_audio_transcription.delta':
console.log('Transcription delta:', message.delta);
break;
case 'conversation.item.input_audio_transcription.completed':
console.log('Transcription complete:', message.transcript);
break;
case 'response.created':
console.log('Response created:', message.response.id);
break;
case 'response.output_item.added':
console.log('Output item added:', message.item.id);
break;
case 'response.output_text.delta':
console.log('Text delta:', message.delta);
break;
    case 'response.output_audio.delta': {
      // Decode the base64 PCM16 chunk; playAudio is an app-defined player.
      const audioBuffer = Buffer.from(message.delta, 'base64');
      playAudio(audioBuffer);
      break;
    }
case 'response.output_audio_transcript.delta':
console.log('Audio transcript delta:', message.delta);
break;
case 'response.done':
console.log('Response complete, status:', message.response.status);
break;
case 'error':
console.error('Error:', message.error.message, message.error.code);
break;
}
});
function updateSession() {
ws.send(JSON.stringify({
type: 'session.update',
session: {
type: 'realtime',
output_modalities: ['text', 'audio'],
instructions: 'You are a helpful AI assistant.',
audio: {
input: {
turn_detection: {
type: 'semantic_vad',
eagerness: 'medium',
create_response: true,
interrupt_response: true
}
},
output: {
voice: 'Clive'
}
}
}
}));
}
function sendMessage(text) {
ws.send(JSON.stringify({
type: 'conversation.item.create',
item: {
type: 'message',
role: 'user',
content: [{ type: 'input_text', text }]
}
}));
}
function createResponse() {
ws.send(JSON.stringify({
type: 'response.create',
response: {
output_modalities: ['text', 'audio']
}
}));
}
function cancelResponse() {
ws.send(JSON.stringify({ type: 'response.cancel' }));
}
function sendAudioChunk(audioChunk) {
ws.send(JSON.stringify({
type: 'input_audio_buffer.append',
audio: audioChunk // base64-encoded audio data
}));
}
function clearAudioBuffer() {
ws.send(JSON.stringify({ type: 'input_audio_buffer.clear' }));
}
```
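If you disable `interrupt_response` and handle barge-in yourself, a typical pattern (a sketch, not a prescribed sequence) is to cancel the in-flight response when the user starts speaking and flush any locally queued playback; `stopLocalPlayback` is a hypothetical app-side helper:

```js
ws.on('message', (buffer) => {
  const message = JSON.parse(buffer.toString());
  if (message.type === 'input_audio_buffer.speech_started') {
    cancelResponse();     // response.cancel, defined above
    stopLocalPlayback();  // hypothetical: drop audio still queued locally
  }
});
```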
See the API reference for full schemas.