> ## Documentation Index
> Fetch the complete documentation index at: https://docs.inworld.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Transcribe audio

> Send the whole audio in a single request, and receive a single transcription.

<RequestExample>
  ```bash cURL theme={"system"}
  # Transcribe one audio clip with a single synchronous request.
  # Replace <YOUR_AUDIO> with the base64-encoded audio. $INWORLD_API_KEY is
  # expanded by the shell because the header is double-quoted.
  curl --request POST \
    --url https://api.inworld.ai/stt/v1/transcribe \
    --header "Authorization: Basic $INWORLD_API_KEY" \
    --header "Content-Type: application/json" \
    --data '{
      "transcribeConfig": {
        "modelId": "groq/whisper-large-v3",
        "audioEncoding": "LINEAR16",
        "language": "en-US",
        "sampleRateHertz": 16000,
        "numberOfChannels": 1
      },
      "audioData": {
        "content": "<YOUR_AUDIO>"
      }
    }'
  ```

  ```python Python theme={"system"}
  import base64
  import os

  import requests

  url = "https://api.inworld.ai/stt/v1/transcribe"
  # Read the key from the environment: unlike a shell, Python does not expand
  # "$INWORLD_API_KEY" inside a string literal.
  api_key = os.environ["INWORLD_API_KEY"]

  # The request carries the audio bytes base64-encoded in `audioData.content`.
  with open("audio.wav", "rb") as f:
      audio_base64 = base64.b64encode(f.read()).decode("utf-8")

  response = requests.post(
      url,
      headers={
          "Authorization": f"Basic {api_key}",
          "Content-Type": "application/json",
      },
      json={
          "transcribeConfig": {
              "modelId": "groq/whisper-large-v3",
              "audioEncoding": "LINEAR16",
              "language": "en-US",
              "sampleRateHertz": 16000,
              "numberOfChannels": 1,
          },
          "audioData": {
              "content": audio_base64,
          },
      },
  )

  # Prints the transcription on success, or the error payload otherwise.
  print(response.json())
  ```

  ```javascript JavaScript theme={"system"}
  const fs = require("fs");

  // Read the key from the environment: JavaScript does not expand
  // "$INWORLD_API_KEY" inside a string literal.
  const apiKey = process.env.INWORLD_API_KEY;
  // The request carries the audio bytes base64-encoded in `audioData.content`.
  const audioBase64 = fs.readFileSync("audio.wav").toString("base64");

  // `await` is not allowed at the top level of a CommonJS script (which
  // `require` implies), so the request runs inside an async function.
  async function main() {
    const response = await fetch("https://api.inworld.ai/stt/v1/transcribe", {
      method: "POST",
      headers: {
        "Authorization": `Basic ${apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        transcribeConfig: {
          modelId: "groq/whisper-large-v3",
          audioEncoding: "LINEAR16",
          language: "en-US",
          sampleRateHertz: 16000,
          numberOfChannels: 1,
        },
        audioData: {
          content: audioBase64,
        },
      }),
    });

    const data = await response.json();
    console.log(data);
  }

  // Surface network or parsing failures instead of an unhandled rejection.
  main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
  ```
</RequestExample>

<ResponseExample>
  ```json 200 theme={"system"}
  {
    "transcription": {
      "transcript": "Hey, I just wanted to check in on the delivery status for my order.",
      "isFinal": true,
      "wordTimestamps": []
    },
    "usage": null
  }
  ```
</ResponseExample>


## OpenAPI

````yaml post /stt/v1/transcribe
# OpenAPI description of the synchronous Speech-To-Text endpoint.
openapi: 3.0.0
info:
  title: Inworld Speech-To-Text API
  version: v1
  contact:
    name: Inworld AI
    url: https://inworld.ai
    email: support@inworld.ai
servers:
  - url: https://api.inworld.ai
# Document-level security: applies to every operation that does not override it.
security:
  - inworld_basic: []
tags:
  - name: SpeechToText
paths:
  /stt/v1/transcribe:
    # Synchronous transcription: one request with the full audio, one response.
    post:
      tags:
        - SpeechToText
      summary: Get Speech-to-Text transcription synchronously
      description: >-
        Send the whole audio in a single request, and receive a single
        transcription.
      operationId: SpeechToText_TranscribeSpeech
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/v1TranscribeSpeechRequest'
        description: Request message for the synchronous TranscribeSpeech API.
        required: true
      responses:
        '200':
          description: A successful response.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/v1TranscribeSpeechResponse'
        # Any 4XX status carries the rpcStatus error payload.
        4XX:
          description: An error response.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/rpcStatus'
      # Request bodies follow v1TranscribeSpeechRequest: `transcribeConfig`
      # (modelId and audioEncoding are required) plus `audioData.content`.
      x-codeSamples:
        - lang: bash
          label: cURL
          source: |-
            curl --location 'https://api.inworld.ai/stt/v1/transcribe' \
            --header "Authorization: Basic $INWORLD_API_KEY" \
            --header 'Content-Type: application/json' \
            --data '{
              "transcribeConfig": {
                "modelId": "groq/whisper-large-v3",
                "audioEncoding": "LINEAR16",
                "language": "en-US"
              },
              "audioData": {
                "content": "<base64-audio-data>"
              }
            }'
        - lang: python
          label: Python
          source: |-
            import requests

            url = "https://api.inworld.ai/stt/v1/transcribe"
            headers = {
                "Authorization": "Basic <api-key>",
                "Content-Type": "application/json"
            }
            payload = {
                "transcribeConfig": {
                    "modelId": "groq/whisper-large-v3",
                    "audioEncoding": "LINEAR16",
                    "language": "en-US"
                },
                "audioData": {
                    "content": "<base64-audio-data>"
                }
            }

            response = requests.post(url, json=payload, headers=headers)
            print(response.json())
        - lang: javascript
          label: JavaScript
          source: |-
            const url = 'https://api.inworld.ai/stt/v1/transcribe';

            const response = await fetch(url, {
              method: 'POST',
              headers: {
                'Authorization': 'Basic <api-key>',
                'Content-Type': 'application/json',
              },
              body: JSON.stringify({
                transcribeConfig: {
                  modelId: 'groq/whisper-large-v3',
                  audioEncoding: 'LINEAR16',
                  language: 'en-US',
                },
                audioData: {
                  content: '<base64-audio-data>',
                },
              }),
            });

            const data = await response.json();
            console.log(data);
components:
  schemas:
    # Request body: transcription settings plus the audio payload; both required.
    v1TranscribeSpeechRequest:
      type: object
      properties:
        transcribeConfig:
          $ref: '#/components/schemas/v1TranscribeConfig'
        audioData:
          $ref: '#/components/schemas/sttv1AudioContent'
      required:
        - transcribeConfig
        - audioData
    # Response body: the transcription result and usage metrics.
    # NOTE(review): the 200 response example on this page shows `usage: null`,
    # but `usage` is not marked nullable here — confirm which one is correct.
    v1TranscribeSpeechResponse:
      type: object
      properties:
        transcription:
          $ref: '#/components/schemas/v1Transcription'
        usage:
          $ref: '#/components/schemas/inworldsttv1Usage'
    # Error payload referenced by the operation's 4XX response.
    rpcStatus:
      type: object
      properties:
        code:
          type: integer
          format: int32
          description: >-
            The error code, as specified by [gRPC status
            codes](https://grpc.io/docs/guides/status-codes/).
          example: 3
        message:
          type: string
          description: A short description of the error.
          example: Unsupported audio encoding.
        # Each detail entry is an open-ended protobufAny object.
        details:
          type: array
          items:
            $ref: '#/components/schemas/protobufAny'
    v1TranscribeConfig:
      type: object
      allOf:
        # Provider-specific configuration: exactly one branch must match, so at
        # most one of groqConfig / inworldSttV1Config may be set.
        - oneOf:
            - title: Groq
              type: object
              properties:
                groqConfig:
                  $ref: '#/components/schemas/v1GroqConfig'
              required:
                - groqConfig
            - title: Inworld STT V1
              type: object
              properties:
                inworldSttV1Config:
                  $ref: '#/components/schemas/v1InworldSttV1Config'
              required:
                - inworldSttV1Config
            - title: No provider-specific config
              type: object
              description: Omit provider-specific configuration entirely.
              properties: {}
              # Without this `not`, the empty schema matches every object, so a
              # request carrying groqConfig or inworldSttV1Config would satisfy
              # two oneOf branches and fail validation.
              not:
                anyOf:
                  - required:
                      - groqConfig
                  - required:
                      - inworldSttV1Config
        # Settings shared by every provider; modelId and audioEncoding are required.
        - type: object
          properties:
            modelId:
              type: string
              description: >-
                The identifier of the model to use for transcription.

                Format: "{provider}/{model-name}".


                Available models:

                - `inworld/inworld-stt-1` — Inworld first-party (Sync +
                WebSocket)

                - `groq/whisper-large-v3` — Groq Whisper (Sync only)

                - `assemblyai/universal-streaming-multilingual` — AssemblyAI
                multilingual (WebSocket only)

                - `assemblyai/universal-streaming-english` — AssemblyAI English
                (WebSocket only)

                - `assemblyai/u3-rt-pro` — AssemblyAI high-accuracy (WebSocket
                only)

                - `assemblyai/whisper-rt` — AssemblyAI Whisper real-time
                (WebSocket only)

                - `soniox/stt-rt-v4` — Soniox real-time (WebSocket only)


                See [STT Introduction](/stt/overview) for the full model
                catalogue.
              example: groq/whisper-large-v3
            # NOTE(review): OpenAPI 3.0 tooling ignores keys placed next to
            # `$ref`, so this `example` may not take effect — confirm the
            # renderer honors it.
            audioEncoding:
              $ref: '#/components/schemas/v1TranscribeConfigAudioEncoding'
              example: LINEAR16
            language:
              type: string
              description: >-
                Language code for speech recognition in BCP-47 format (e.g.,
                "en-US", "ja-JP").

                If not specified, the model will attempt to auto-detect the
                language.
              example: en-US
            sampleRateHertz:
              type: integer
              format: int32
              description: >-
                Sample rate of the audio data in Hertz. Required when the sample
                rate

                cannot be inferred from the audio header (e.g., raw PCM
                streams).

                If not set - default sample rate 16000 will be used.
              example: 16000
            numberOfChannels:
              type: integer
              format: int32
              description: >-
                Number of channels in the audio data. Required when the number
                of channels

                cannot be inferred from the audio header (e.g., raw PCM
                streams).

                If not set - default number of channels 1 will be used.
              example: 1
            # NOTE(review): the wording ("client is silent", "end-of-turn")
            # suggests this field and endOfTurnConfidenceThreshold target
            # streaming sessions — confirm they apply to the synchronous endpoint.
            inactivityTimeoutSeconds:
              type: integer
              format: int32
              description: >-
                Inactivity timeout in seconds. If the client is silent for this
                duration, the transcription will be stopped.
            endOfTurnConfidenceThreshold:
              type: number
              format: float
              description: >-
                Confidence threshold for end-of-turn prediction. Higher values
                reduce

                false-positives. Range: [0.0, 1.0]. Default: 0.5.
              example: 0.5
            prompts:
              type: array
              items:
                type: string
              description: >-
                Contextual prompt to guide the model (e.g., domain-specific
                context).
            includeWordTimestamps:
              type: boolean
              description: If true, includes per-word timing information in the response.
            voiceProfileConfig:
              $ref: '#/components/schemas/v1VoiceProfileConfig'
          required:
            - modelId
            - audioEncoding
      description: >-
        Configuration for transcribing audio.

        Contains model selection, audio format settings, and optional feature
        configurations.

        Provider-specific configuration is mutually exclusive — set at most one
        of groqConfig or inworldSttV1Config.
    # Wrapper for the audio payload sent as `audioData` in the request.
    sttv1AudioContent:
      type: object
      properties:
        content:
          type: string
          # `format: byte` means the value is base64-encoded on the wire.
          format: byte
          description: >-
            The audio bytes, base64-encoded, in the encoding specified by
            `transcribeConfig.audioEncoding`.
          example: <YOUR_AUDIO>
      description: Container for raw audio data bytes.
      required:
        - content
    # Transcription result returned in the response; all fields are read-only.
    v1Transcription:
      type: object
      properties:
        transcript:
          type: string
          description: Full transcribed text for this segment.
          readOnly: true
        isFinal:
          type: boolean
          description: >-
            Indicates whether this is a finalized result or an interim (partial)
            result that may be updated as more audio is processed.
          readOnly: true
        wordTimestamps:
          type: array
          items:
            $ref: '#/components/schemas/TranscriptionWordTimestamp'
          description: >-
            Per-word timing and confidence data. Only populated when
            `includeWordTimestamps` is enabled in the request config. **Coming
            soon** — word timestamps are not yet populated.
          readOnly: true
      description: >-
        Represents the result of a speech-to-text transcription.

        May be an interim (partial) result or a final result depending on the
        `isFinal` field.
    # Usage metrics returned as `usage` in the response; fields are read-only.
    inworldsttv1Usage:
      type: object
      properties:
        transcribedAudioMs:
          type: integer
          format: int32
          description: The duration of the transcribed audio in milliseconds.
          readOnly: true
        modelId:
          type: string
          description: The identifier of the model used for transcription.
          readOnly: true
      description: Usage metrics for billing and monitoring purposes.
    # Open-ended object used for rpcStatus.details items: only '@type' is
    # declared and any additional properties are allowed.
    protobufAny:
      type: object
      properties:
        '@type':
          type: string
      additionalProperties: {}
    # Provider-specific options supplied as `groqConfig` in v1TranscribeConfig.
    v1GroqConfig:
      type: object
      properties:
        temperature:
          type: number
          format: float
          description: >-
            Temperature for the model. Controls randomness in predictions.
            Higher values produce more varied output. Range: [0.0, 1.0].
          example: 1
      description: Configuration for Groq STT models.
    # Provider-specific options supplied as `inworldSttV1Config` in
    # v1TranscribeConfig.
    v1InworldSttV1Config:
      type: object
      properties:
        minEndOfTurnSilenceWhenConfident:
          type: integer
          format: int32
          description: Minimum silence duration when confidence is high (milliseconds).
        vadThreshold:
          type: number
          format: float
          description: 'Voice activity detection threshold. Range: [0.0, 1.0]. Default: 0.5.'
          example: 0.5
      description: Configuration for Inworld STT 1 models.
    # Allowed values for v1TranscribeConfig.audioEncoding.
    v1TranscribeConfigAudioEncoding:
      type: string
      enum:
        - AUDIO_ENCODING_UNSPECIFIED
        - AUTO_DETECT
        - LINEAR16
        - MP3
        - OGG_OPUS
        - FLAC
      default: AUDIO_ENCODING_UNSPECIFIED
      # A literal block scalar keeps each list item on its own line so the
      # Markdown list renders intact.
      description: |-
        Supported audio encoding formats.

        - `AUDIO_ENCODING_UNSPECIFIED`: Not specified. Will return
          `INVALID_ARGUMENT`.
        - `AUTO_DETECT`: Automatically detect audio encoding from the audio
          header.
        - `LINEAR16`: Uncompressed 16-bit signed little-endian samples (Linear
          PCM).
        - `MP3`: MP3 audio. Compressed audio format. Not supported for streaming
          transcription.
        - `OGG_OPUS`: Opus encoded audio wrapped in an OGG container. Playable
          natively on Android and in browsers (Chrome, Firefox). Higher quality
          than MP3 at similar bitrate. Not supported for streaming
          transcription.
        - `FLAC`: FLAC encoded audio. Lossless audio format. Not supported for
          streaming transcription.
    # Referenced by v1TranscribeConfig.voiceProfileConfig; when the object is
    # present, enableVoiceProfile is required.
    v1VoiceProfileConfig:
      type: object
      properties:
        enableVoiceProfile:
          type: boolean
          description: Enables voice profile feature for this request or stream.
        topN:
          type: integer
          format: int32
          description: 'Number of top labels from each class to return. Default: 10.'
      required:
        - enableVoiceProfile
      description: Configuration for voice profile detection.
    # Item type of v1Transcription.wordTimestamps; every field is read-only.
    TranscriptionWordTimestamp:
      type: object
      properties:
        word:
          type: string
          description: The transcribed word.
          readOnly: true
        confidence:
          type: number
          format: float
          description: Recognition confidence score for this word, ranging from 0.0 to 1.0.
          readOnly: true
        # Start and end offsets are both measured from the beginning of the audio.
        startTimeMs:
          type: integer
          format: int32
          description: >-
            Offset from the beginning of the audio to the start of this word, in
            milliseconds.
          readOnly: true
        endTimeMs:
          type: integer
          format: int32
          description: >-
            Offset from the beginning of the audio to the end of this word, in
            milliseconds.
          readOnly: true
      description: Timing and confidence information for a single transcribed word.
  securitySchemes:
    # Declared as an apiKey carried in the Authorization header, so the client
    # sends the complete header value, including the `Basic ` prefix.
    # NOTE(review): this is not `type: http` / `scheme: basic`; confirm that is
    # intentional.
    inworld_basic:
      type: apiKey
      in: header
      name: Authorization
      description: >-
        Your [authentication](../../../api-reference/introduction) credentials.
        For Basic authentication, please populate `Basic $INWORLD_API_KEY`

````