> ## Documentation Index
> Fetch the complete documentation index at: https://docs.inworld.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Synthesize speech (stream)

> Receive audio chunks as they are individually processed.



## OpenAPI

````yaml post /tts/v1/voice:stream
# OpenAPI 3.0 description of the Inworld Text-to-Speech API.
openapi: 3.0.0
info:
  title: Inworld Text-to-Speech API
  version: v1
  contact:
    name: Inworld AI
    url: https://inworld.ai
    email: support@inworld.ai
# Server URL that the paths below are relative to.
servers:
  - url: https://api.inworld.ai
# Document-wide security requirement; `inworld_basic` is declared under
# components.securitySchemes.
security:
  - inworld_basic: []
# Tag declarations. The operation in this document is tagged `TextToSpeech`.
tags:
  - name: TextToSpeech
  - name: AudioPromptPreparationService
  - name: SpeechToPhonemesService
paths:
  # Streaming speech synthesis endpoint.
  /tts/v1/voice:stream:
    post:
      tags:
        - TextToSpeech
      summary: Synthesize speech (stream)
      description: Receive audio chunks as they are individually processed.
      operationId: TextToSpeech_SynthesizeSpeech
      # The request body is declared once under components.requestBodies.
      requestBody:
        $ref: '#/components/requestBodies/ttsv1SynthesizeSpeechStreamRequest'
      responses:
        # Schema of one streamed message: `result` holds an audio chunk and
        # `error` reports a failure that happened in the stream.
        '200':
          description: A successful response returns a stream of objects.
          content:
            application/json:
              schema:
                type: object
                properties:
                  result:
                    # `$ref` is wrapped in `allOf` because OpenAPI 3.0 ignores
                    # keys that sit next to a bare `$ref`; this mirrors the
                    # `error` property below.
                    allOf:
                      - $ref: '#/components/schemas/ttsv1SynthesizeSpeechResponse'
                    description: >-
                      A chunk containing the audio data. If using `LINEAR16`,
                      every chunk, not just the initial chunk, will contain a
                      complete WAV header so it can be played independently.
                  error:
                    allOf:
                      - $ref: '#/components/schemas/rpcStatusStream'
                    description: >-
                      A response may contain an `error` object if an error
                      happens in the stream.
              examples:
                default_response:
                  summary: Default streamed response (no timestamps)
                  description: >-
                    Response when timestampType is not specified. Returns audio
                    chunks with usage.
                  value:
                    result:
                      audioContent: UklGRiRQAQBXQVZFZm1...
                      usage:
                        processedCharactersCount: 64
                        modelId: inworld-tts-2
                async_response:
                  summary: ASYNC streamed response with word alignment
                  description: >-
                    With `timestampTransportStrategy: ASYNC` and `timestampType:
                    WORD`, audio chunks arrive first (with `audioContent` and
                    `usage`), followed by trailing timestamp messages (with
                    empty `audioContent` and `timestampInfo`). Each trailing
                    message contains alignment for one word, including
                    phoneme-level `phoneticDetails` with `visemeSymbol` for
                    lip-sync. This reduces first-audio-chunk latency with v1.5+
                    models.
                  value:
                    result:
                      audioContent: ''
                      usage:
                        processedCharactersCount: 0
                        modelId: inworld-tts-2
                      timestampInfo:
                        wordAlignment:
                          words:
                            - Hello
                          wordStartTimeSeconds:
                            - 0.03
                          wordEndTimeSeconds:
                            - 0.46
                          phoneticDetails:
                            - wordIndex: 0
                              phones:
                                - phoneSymbol: h
                                  startTimeSeconds: 0.03
                                  durationSeconds: 0.18
                                  visemeSymbol: aei
                                - phoneSymbol: ə
                                  startTimeSeconds: 0.21
                                  durationSeconds: 0.03
                                  visemeSymbol: aei
                                - phoneSymbol: l
                                  startTimeSeconds: 0.24
                                  durationSeconds: 0.09
                                  visemeSymbol: l
                                - phoneSymbol: oʊ1
                                  startTimeSeconds: 0.33
                                  durationSeconds: 0.13
                                  visemeSymbol: o
                              isPartial: false
        # Status-code range key, quoted like '200' above.
        '4XX':
          description: An error response.
          content:
            application/json:
              schema:
                type: object
                properties:
                  error:
                    allOf:
                      - $ref: '#/components/schemas/rpcStatusStream'
      # Code samples attached to this operation. Every `source` is a literal
      # block scalar (`|-`), so the sample's line breaks and indentation are
      # kept exactly as written.
      x-codeSamples:
        - lang: bash
          label: cURL
          source: |-
            curl --location 'https://api.inworld.ai/tts/v1/voice:stream' \
            --header "Authorization: Basic $INWORLD_API_KEY" \
            --header 'Content-Type: application/json' \
            --data '{
              "text": "Hello, world! What a wonderful day to be a text-to-speech model!",
              "voiceId": "Dennis",
              "modelId": "inworld-tts-2",
              "audioConfig": {
                "audioEncoding": "LINEAR16",
                "sampleRateHertz": 22050
              },
              "deliveryMode": "BALANCED"
            }'
        - lang: python
          label: Python
          source: |-
            import requests

            url = "https://api.inworld.ai/tts/v1/voice:stream"
            headers = {
                "Authorization": "Basic <api-key>",
                "Content-Type": "application/json"
            }
            payload = {
                "text": "Hello, world! What a wonderful day to be a text-to-speech model!",
                "voiceId": "Dennis",
                "modelId": "inworld-tts-2",
                "audioConfig": {
                    "audioEncoding": "LINEAR16",
                    "sampleRateHertz": 22050
                },
                "deliveryMode": "BALANCED"
            }

            response = requests.post(url, json=payload, headers=headers, stream=True)
            for chunk in response.iter_content(chunk_size=None):
                print(chunk)
        - lang: javascript
          label: JavaScript
          source: |-
            const url = 'https://api.inworld.ai/tts/v1/voice:stream';

            const response = await fetch(url, {
              method: 'POST',
              headers: {
                'Authorization': 'Basic <api-key>',
                'Content-Type': 'application/json',
              },
              body: JSON.stringify({
                text: 'Hello, world! What a wonderful day to be a text-to-speech model!',
                voiceId: 'Dennis',
                modelId: 'inworld-tts-2',
                audioConfig: {
                  audioEncoding: 'LINEAR16',
                  sampleRateHertz: 22050,
                },
                deliveryMode: 'BALANCED',
              }),
            });

            const reader = response.body.getReader();
            while (true) {
              const { done, value } = await reader.read();
              if (done) break;
              console.log(value);
            }
components:
  requestBodies:
    # JSON request body referenced by the POST /tts/v1/voice:stream operation.
    ttsv1SynthesizeSpeechStreamRequest:
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ttsv1SynthesizeSpeechStreamRequest'
          # Example payload; it uses the same values as the x-codeSamples
          # requests.
          example:
            text: Hello, world! What a wonderful day to be a text-to-speech model!
            voiceId: Dennis
            modelId: inworld-tts-2
            audioConfig:
              audioEncoding: LINEAR16
              sampleRateHertz: 22050
            deliveryMode: BALANCED
      required: true
  schemas:
    # Payload of one `result` message: audio bytes, usage information and
    # optional timestamp alignment.
    ttsv1SynthesizeSpeechResponse:
      type: object
      properties:
        audioContent:
          type: string
          format: byte
          description: >-
            The audio data bytes encoded in the format as specified in the
            request. For encodings that are wrapped in containers (e.g. MP3,
            OPUS) the header is included. For `LINEAR16` and `WAV` audio a WAV
            header is included; `PCM` audio has no WAV header.


            Maximum output audio size of 16MB. To avoid errors with longer
            texts, please use a compressed audio format with an appropriate bit
            rate, or use the streaming endpoint.
        # Declared so the schema matches the `usage` object shown in the
        # response examples of the streaming operation.
        usage:
          type: object
          properties:
            processedCharactersCount:
              type: integer
              description: Number of input characters processed for this message.
            modelId:
              type: string
              description: The ID of the model used for synthesizing speech.
          description: Usage information reported with the message.
        timestampInfo:
          type: object
          properties:
            wordAlignment:
              type: object
              properties:
                words:
                  type: array
                  items:
                    type: string
                  description: Aligned words in order.
                wordStartTimeSeconds:
                  type: array
                  items:
                    type: number
                  description: >-
                    Start time for each word in seconds from the beginning of
                    the audio.
                wordEndTimeSeconds:
                  type: array
                  items:
                    type: number
                  description: >-
                    End time for each word in seconds from the beginning of the
                    audio.
                phoneticDetails:
                  type: array
                  items:
                    type: object
                    properties:
                      wordIndex:
                        type: integer
                        description: >-
                          Index of the word this phonetic detail belongs to
                          (0-based).
                      phones:
                        type: array
                        items:
                          type: object
                          properties:
                            phoneSymbol:
                              type: string
                              description: The phoneme symbol (IPA notation).
                            startTimeSeconds:
                              type: number
                              description: Start time of the phoneme in seconds.
                            durationSeconds:
                              type: number
                              description: Duration of the phoneme in seconds.
                            visemeSymbol:
                              type: string
                              description: >-
                                The viseme symbol for lip-sync animation (e.g.,
                                `aei`, `o`, `bmp`, `fv`, `l`, `r`, `th`, `qw`,
                                `ee`, `chjsh`, `cdgknstxyz`).
                        description: Array of phonemes that make up this word.
                      isPartial:
                        type: boolean
                        description: >-
                          True when the server considers the word potentially
                          unstable (e.g., last word in a non-final streaming
                          update). Clients may choose to delay processing
                          partial words until `isPartial` becomes `false`.
                  description: >-
                    Detailed phoneme-level timing and viseme information. **Only
                    available for TTS 1.5 and TTS-2 models**
                    (`inworld-tts-1.5-mini`, `inworld-tts-1.5-max`,
                    `inworld-tts-2`). Useful for precise lip-sync animation.
              description: Word-level alignment when timestampType is WORD.
            characterAlignment:
              type: object
              properties:
                characters:
                  type: array
                  items:
                    type: string
                  description: >-
                    Aligned characters (including punctuation and spaces) in
                    order.
                characterStartTimeSeconds:
                  type: array
                  items:
                    type: number
                  description: >-
                    Start time for each character in seconds from the beginning
                    of the audio.
                characterEndTimeSeconds:
                  type: array
                  items:
                    type: number
                  description: >-
                    End time for each character in seconds from the beginning of
                    the audio.
              description: Character-level alignment when timestampType is CHARACTER.
          description: Timestamp alignment information (present when alignment is enabled).
    # Error object referenced by the `error` property of the 200 and 4XX
    # responses.
    rpcStatusStream:
      type: object
      properties:
        code:
          type: integer
          format: int32
          description: >-
            The error code, as specified by [gRPC status
            codes](https://grpc.io/docs/guides/status-codes/)
          example: 3
        message:
          type: string
          description: A short description of the error.
        # List of `protobufAny` objects.
        details:
          type: array
          items:
            $ref: '#/components/schemas/protobufAny'
          example: []
    # Streaming request: every field of `ttsv1SynthesizeSpeechRequest` (pulled
    # in through `allOf`) plus the stream-only `timestampTransportStrategy`.
    ttsv1SynthesizeSpeechStreamRequest:
      type: object
      description: Request body for the streaming speech synthesis endpoint.
      allOf:
        - $ref: '#/components/schemas/ttsv1SynthesizeSpeechRequest'
      properties:
        timestampTransportStrategy:
          type: string
          enum:
            - TIMESTAMP_TRANSPORT_STRATEGY_UNSPECIFIED
            - SYNC
            - ASYNC
          default: TIMESTAMP_TRANSPORT_STRATEGY_UNSPECIFIED
          description: >-
            The transport strategy for timestamp info.


            - `TIMESTAMP_TRANSPORT_STRATEGY_UNSPECIFIED`: The service will
            automatically decide the transport strategy.

            - `SYNC`: Timestamps will be returned in the same message as the
            audio data.

            - `ASYNC`: Timestamps may be returned in trailing messages after
            the audio data. Use this strategy to reduce latency of the first
            audio chunk with v1.5+ models.
    # Open-ended object: a string `@type` plus arbitrary additional properties
    # (`additionalProperties: {}` places no constraint on them).
    protobufAny:
      type: object
      properties:
        '@type':
          type: string
      additionalProperties: {}
    # Base synthesis request. `text`, `voiceId` and `modelId` are required;
    # every other field is optional.
    ttsv1SynthesizeSpeechRequest:
      type: object
      properties:
        text:
          type: string
          description: >-
            The text to be synthesized into speech. Maximum input of 2,000
            characters.
        voiceId:
          type: string
          description: The ID of the voice to use for synthesizing speech.
        audioConfig:
          $ref: '#/components/schemas/ttsv1AudioConfig'
        modelId:
          type: string
          description: >-
            The ID of the model to use for synthesizing speech. See
            [Models](../../../tts/tts-models) for available models.
        language:
          type: string
          description: >-
            BCP-47 language tag (e.g., `en-US`, `fr-FR`, `ja-JP`) specifying the
            language that the given voice should speak the text in. If a
            localized voice prompt exists for the language, it will be used.
            When omitted, the original voice prompt will be used and the
            language will be auto-detected from the input text. If an invalid
            language code is provided, an error will be returned.


            See [Languages](../../../tts/capabilities/multilingual) for more
            details.
        deliveryMode:
          type: string
          enum:
            - DELIVERY_MODE_UNSPECIFIED
            - STABLE
            - BALANCED
            - CREATIVE
          default: DELIVERY_MODE_UNSPECIFIED
          description: >-
            *Only supported by `inworld-tts-2`. The field is ignored on other
            models.*


            Controls how varied the output is.


            - `DELIVERY_MODE_UNSPECIFIED`: Defaults to `BALANCED` behavior.

            - `STABLE`: Optimizes for more consistent, predictable output.

            - `BALANCED`: Balanced between stability and diversity.

            - `CREATIVE`: Optimizes for increased emotional range and variation.
        temperature:
          type: number
          format: float
          default: 1
          description: >-
            *Ignored on `inworld-tts-2`. Use
            [`deliveryMode`](#body-delivery-mode) instead.*


            Determines the degree of randomness when sampling audio tokens to
            generate the response.


            Defaults to 1.0. Accepts values between 0 (exclusive) and 2
            (inclusive). Higher values will make the output more random and can
            lead to more expressive results. Lower values will make it more
            deterministic. If 0 is provided, the default value will be used.


            For the most stable results, we recommend using the default value.
        timestampType:
          type: string
          enum:
            - TIMESTAMP_TYPE_UNSPECIFIED
            - WORD
            - CHARACTER
          default: TIMESTAMP_TYPE_UNSPECIFIED
          description: >-
            Controls timestamp metadata returned with the audio. When enabled,
            the response includes timing arrays, which can be useful for
            word-highlighting, karaoke-style captions, and lip-sync.


            - WORD: Output arrays under `timestampInfo.wordAlignment` (words,
            wordStartTimeSeconds, wordEndTimeSeconds).

            - CHARACTER: Output arrays under `timestampInfo.characterAlignment`
            (characters, characterStartTimeSeconds, characterEndTimeSeconds).

            - TIMESTAMP_TYPE_UNSPECIFIED: Do not compute alignment; timestamp
            arrays will be empty or omitted.


            **Phonetic details:** `phoneticDetails` is currently only returned
            for **WORD** alignment (not CHARACTER).


            **Latency note:** Alignment adds additional computation. Enabling
            alignment can increase latency.


            **Model differences:**

            - **TTS 1.0 models** (`inworld-tts-1`, `inworld-tts-1-max`): Returns
            basic word/character timing arrays.

            - **TTS 1.5 and TTS-2 models** (`inworld-tts-1.5-mini`,
            `inworld-tts-1.5-max`, `inworld-tts-2`): Returns enhanced alignment
            data with detailed `phoneticDetails` containing phoneme-level timing
            and viseme symbols for lip-sync.
        applyTextNormalization:
          type: string
          enum:
            - APPLY_TEXT_NORMALIZATION_UNSPECIFIED
            # Quoted on purpose: a bare ON / OFF is read as a boolean by
            # YAML 1.1 parsers.
            - 'ON'
            - 'OFF'
          default: APPLY_TEXT_NORMALIZATION_UNSPECIFIED
          description: >-
            When enabled, text normalization automatically expands and
            standardizes things like numbers, dates, times, and abbreviations
            before converting them to speech. For example, Dr. Smith becomes
            Doctor Smith, and 3/10/25 is spoken as March tenth, twenty
            twenty-five. Turning this off may reduce latency, but the speech
            output will read the text exactly as written. Defaults to
            automatically deciding whether to apply text normalization.
      required:
        - text
        - voiceId
        - modelId
    # Audio output settings referenced by the request's `audioConfig` field.
    ttsv1AudioConfig:
      type: object
      properties:
        audioEncoding:
          $ref: '#/components/schemas/v1AudioConfigAudioEncoding'
        bitRate:
          type: integer
          format: int32
          description: >-
            Bits per second of the audio. Only for compressed audio formats
            (`MP3`, `OGG_OPUS`). The default is 128,000.
        sampleRateHertz:
          type: integer
          format: int32
          description: >-
            The synthesis sample rate (in hertz) for this audio. Accepts values
            within the range [8000, 48000]. Supported sample rates are: 8000,
            16000, 22050, 24000, 32000, 44100, 48000.


            When this is specified, if this is different from the voice's
            natural sample rate, then the audio will be converted to the desired
            sample rate (which might result in worse audio quality), unless the
            specified sample rate is not supported for the encoding chosen, in
            which case it will fail the request and return an error. The default
            is 48,000.
        speakingRate:
          type: number
          format: double
          description: >-
            Speaking rate/speed, in the range [0.5, 1.5]. The default is 1.0,
            which is the normal native speed supported by the specific voice. We
            recommend using values above 0.8 to ensure high quality.
      description: Configurations to use when synthesizing speech.
    # Output audio encodings. The description is a literal block scalar (`|-`),
    # so its bullet list keeps one entry per line.
    v1AudioConfigAudioEncoding:
      type: string
      enum:
        - LINEAR16
        - MP3
        - OGG_OPUS
        - ALAW
        - MULAW
        - FLAC
        - PCM
        - WAV
      default: MP3
      description: |-
        The desired output format of the synthesized audio. Defaults to `MP3`.
         - `LINEAR16`: Uncompressed 16-bit signed little-endian samples (Linear PCM). For non-streaming, the WAV header is included in the response. For streaming, the WAV header is included in every audio chunk.
         - `MP3`: MP3 audio.
         - `OGG_OPUS`: Opus encoded audio wrapped in an ogg container. The result will be a file which can be played natively on Android, and in browsers (at least Chrome and Firefox). The quality of the encoding is considerably higher than MP3 while using approximately the same bitrate.
         - `ALAW`: ALAW encoded audio. 8-bit companded PCM.
         - `MULAW`: MULAW encoded audio. 8-bit companded PCM.
         - `FLAC`: FLAC encoded audio. Lossless audio format.
         - `PCM`: PCM audio. Uncompressed 16-bit signed little-endian samples with no WAV header.
         - `WAV`: WAV audio. Uncompressed 16-bit signed little-endian samples. For non-streaming, the WAV header is included in the response. For streaming, the WAV header is included in the first audio chunk only.
  securitySchemes:
    # Declared as an `apiKey` scheme carried in the `Authorization` request
    # header; this is the scheme named by the top-level `security` key.
    inworld_basic:
      type: apiKey
      in: header
      name: Authorization
      description: >-
        Your [authentication](../../../api-reference/introduction) credentials.
        For Basic authentication, please populate `Basic $INWORLD_API_KEY`

````