Endpoint

wss://api.inworld.ai/api/v1/realtime/session?key=<session-id>&protocol=realtime
Parameter | Required | Description
key | Yes | Session ID from your app
protocol | Yes | realtime

Authentication

Environment | Header | Notes
Server-side (Node.js) | Authorization: Basic <base64-api-key> | The API key from Inworld Portal is already Base64-encoded
Client-side (browser) | Authorization: Bearer <jwt-token> | Mint a JWT on your backend. See the JWT sample app for a complete example
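
For browser clients, the usual pattern is a small backend route that mints the JWT and hands it to the page, so the API key never leaves the server. A minimal sketch, assuming an Express backend and a hypothetical mintRealtimeJwt() helper standing in for the logic shown in the JWT sample app:
import express from 'express';
import { mintRealtimeJwt } from './auth.js'; // hypothetical helper; see the JWT sample app

const app = express();

// The browser fetches a short-lived JWT here, then connects with
// Authorization: Bearer <jwt-token>. The Basic API key stays server-side.
app.get('/realtime-token', async (req, res) => {
  const token = await mintRealtimeJwt(process.env.INWORLD_API_KEY);
  res.json({ token });
});

app.listen(3000);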

Flow

  1. Connect → receive session.created
  2. Send session.update (instructions, audio config, tools)
  3. Stream audio (input_audio_buffer.append) or text (conversation.item.create)
  4. response.create → handle response.output_* until response.done

Session Config

session.update accepts partial updates, so you can change your prompt, voice, model, tools, and other settings mid-conversation.
ws.send(JSON.stringify({
  type: 'session.update',
  session: {
    type: 'realtime',
    modelId: 'openai/gpt-4o-mini',
    instructions: 'You are a concise concierge.',
    output_modalities: ['audio', 'text'],
    audio: {
      input: {
        turn_detection: {
          type: 'semantic_vad',
          eagerness: 'medium',
          create_response: true,
          interrupt_response: true
        }
      },
      output: {
        voice: 'Clive',
        model: 'inworld-tts-1.5-mini',
        speed: 1.0
      }
    },
    tools: [{
      type: 'function',
      name: 'get_weather',
      description: 'Fetch weather for a location',
      parameters: {
        type: 'object',
        properties: { location: { type: 'string' } },
        required: ['location']
      }
    }]
  }
}));
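
Because updates are partial, a later session.update can touch a single field and leave the rest of the session untouched. A minimal sketch that sets only the output voice mid-conversation (voice name reused from the config above):
ws.send(JSON.stringify({
  type: 'session.update',
  session: {
    type: 'realtime',
    audio: { output: { voice: 'Clive' } } // only the voice changes; other settings are preserved
  }
}));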

Audio

Input and output audio should be PCM16, 24 kHz, mono, base64-encoded. The recommended chunk size is 100–200 ms.
ws.send(JSON.stringify({ type: 'input_audio_buffer.append', audio: base64PcmChunk }));
Use input_audio_buffer.clear to discard unwanted audio.
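
At 24 kHz, 16-bit mono, 200 ms of audio is 24000 × 2 × 0.2 = 9,600 bytes. A minimal sketch of slicing a raw PCM16 Node.js Buffer (pcmBuffer, assumed to hold your captured audio) into chunks of that size and streaming them:
const CHUNK_BYTES = 9600; // 200 ms of PCM16 at 24 kHz mono (24000 samples/s * 2 bytes * 0.2 s)

for (let offset = 0; offset < pcmBuffer.length; offset += CHUNK_BYTES) {
  const chunk = pcmBuffer.subarray(offset, offset + CHUNK_BYTES);
  ws.send(JSON.stringify({
    type: 'input_audio_buffer.append',
    audio: chunk.toString('base64')
  }));
}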

Text

The Realtime API can accept text as well as audio. Send it from your client using conversation.item.create.
ws.send(JSON.stringify({
  type: 'conversation.item.create',
  item: {
    type: 'message',
    role: 'user',
    content: [{
      type: 'input_text',
      text: 'Can you summarize the notes I sent?'
    }]
  }
}));
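
Per step 4 of the flow above, follow the text item with response.create to request a reply (unless your turn-detection config already creates responses automatically):
ws.send(JSON.stringify({ type: 'response.create' }));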

Events

Speech-to-speech conversations are driven by WebSocket events: client events that you send to the API, and server events that you receive and react to.
  • Session: session.created, session.updated
  • Conversation: conversation.item.added/done/retrieved/deleted/truncated, transcription deltas/completions
  • Responses: response.created, response.output_item.added/done, response.output_text.delta/done, response.output_audio.delta/done, response.done
  • Audio/VAD: input_audio_buffer.speech_started, input_audio_buffer.speech_stopped, response.output_audio_transcript.delta
  • Errors: error
The full list of events and their schemas is available in the API reference.

Node.js WebSocket client example

Server-side Node.js example using the ws library with Basic auth.
import WebSocket from 'ws';

const sessionId = 'your-session-id';
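// The API key from Inworld Portal is already Base64-encoded (see Authentication)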
const credentials = process.env.INWORLD_API_KEY;

const ws = new WebSocket(`wss://api.inworld.ai/api/v1/realtime/session?key=${sessionId}&protocol=realtime`, {
  headers: {
    Authorization: `Basic ${credentials}`
  }
});

ws.on('open', () => {
  console.log('WebSocket connected');
});

ws.on('message', (buffer) => {
  const message = JSON.parse(buffer.toString());

  switch (message.type) {
    case 'session.created':
      console.log('Session created:', message.session.id);
      updateSession();
      break;
    case 'session.updated':
      console.log('Session updated');
      sendMessage('Hello!');
      break;
    case 'conversation.item.added':
      console.log('Conversation item added:', message.item.id);
      break;
    case 'conversation.item.done':
      console.log('Conversation item done');
      createResponse();
      break;
    case 'input_audio_buffer.speech_started':
      console.log('Speech started at', message.audio_start_ms, 'ms');
      break;
    case 'input_audio_buffer.speech_stopped':
      console.log('Speech stopped at', message.audio_end_ms, 'ms');
      break;
    case 'conversation.item.input_audio_transcription.delta':
      console.log('Transcription delta:', message.delta);
      break;
    case 'conversation.item.input_audio_transcription.completed':
      console.log('Transcription complete:', message.transcript);
      break;
    case 'response.created':
      console.log('Response created:', message.response.id);
      break;
    case 'response.output_item.added':
      console.log('Output item added:', message.item.id);
      break;
    case 'response.output_text.delta':
      console.log('Text delta:', message.delta);
      break;
    case 'response.output_audio.delta': {
      // Decode the base64 audio chunk; playAudio is your own playback implementation
      const audioBuffer = Buffer.from(message.delta, 'base64');
      playAudio(audioBuffer);
      break;
    }
    case 'response.output_audio_transcript.delta':
      console.log('Audio transcript delta:', message.delta);
      break;
    case 'response.done':
      console.log('Response complete, status:', message.response.status);
      break;
    case 'error':
      console.error('Error:', message.error.message, message.error.code);
      break;
  }
});

function updateSession() {
  ws.send(JSON.stringify({
    type: 'session.update',
    session: {
      type: 'realtime',
      output_modalities: ['text', 'audio'],
      instructions: 'You are a helpful AI assistant.',
      audio: {
        input: {
          turn_detection: {
            type: 'semantic_vad',
            eagerness: 'medium',
            create_response: true,
            interrupt_response: true
          }
        },
        output: {
          voice: 'Clive'
        }
      }
    }
  }));
}

function sendMessage(text) {
  ws.send(JSON.stringify({
    type: 'conversation.item.create',
    item: {
      type: 'message',
      role: 'user',
      content: [{ type: 'input_text', text }]
    }
  }));
}

function createResponse() {
  ws.send(JSON.stringify({
    type: 'response.create',
    response: {
      output_modalities: ['text', 'audio']
    }
  }));
}

function cancelResponse() {
  ws.send(JSON.stringify({ type: 'response.cancel' }));
}

function sendAudioChunk(audioChunk) {
  ws.send(JSON.stringify({
    type: 'input_audio_buffer.append',
    audio: audioChunk // base64-encoded audio data
  }));
}

function clearAudioBuffer() {
  ws.send(JSON.stringify({ type: 'input_audio_buffer.clear' }));
}
See the API reference for full schemas.