> ## Documentation Index
> Fetch the complete documentation index at: https://friendli.ai/docs/llms.txt
> Use this file to discover all available pages before exploring further.

# Container Audio Transcriptions

> Transcribe audio files to text using Friendli Container. Run speech-to-text models locally on your own GPU hardware with full data privacy.

Given an audio file, the model transcribes it into text.


## OpenAPI

````yaml https://github.com/friendliai/friendli-openapi/raw/refs/heads/main/openapi.yaml post /v1/audio/transcriptions
# OpenAPI 3.1 description of the Friendli Suite API. The only operation
# defined in this document is POST /v1/audio/transcriptions.
openapi: 3.1.0
info:
  title: Friendli Suite API Reference
  description: This is an OpenAPI reference of Friendli Suite API.
  termsOfService: https://friendli.ai/terms-of-service
  contact:
    name: FriendliAI Support Team
    email: support@friendli.ai
  version: 0.1.0
# Document-level default server. The /v1/audio/transcriptions path item
# declares its own `servers` entry (http://localhost:8000).
servers:
  - url: https://api.friendli.ai
# Empty list: no security requirement is declared at the document level.
security: []
# Most tag names are prefixed with Serverless., Dedicated. or Container.;
# Cost, Dataset and File carry no prefix. The operation in this document is
# tagged Container.Audio.
tags:
  - name: Serverless.Chat
  - name: Serverless.ToolAssistedChat
  - name: Serverless.Messages
  - name: Serverless.ChatRender
  - name: Serverless.Completions
  - name: Serverless.Token
  - name: Serverless.Audio
  - name: Serverless.Model
  - name: Serverless.Knowledge
  - name: Dedicated.Chat
  - name: Dedicated.Messages
  - name: Dedicated.ChatRender
  - name: Dedicated.Completions
  - name: Dedicated.Embeddings
  - name: Dedicated.TextClassification
  - name: Dedicated.Token
  - name: Dedicated.Image
  - name: Dedicated.Audio
  - name: Dedicated.Endpoint
  - name: Container.Chat
  - name: Container.Messages
  - name: Container.Completions
  - name: Container.TextClassification
  - name: Container.Token
  - name: Container.Image
  - name: Container.Audio
  - name: Cost
  - name: Dataset
  - name: File
# The only path item in this document: the audio transcription endpoint,
# tagged Container.Audio.
paths:
  /v1/audio/transcriptions:
    # Path-level `servers` entry; it points this endpoint at a server
    # listening on localhost port 8000.
    servers:
      - url: http://localhost:8000
    post:
      tags:
        - Container.Audio
      summary: Audio transcriptions
      description: Given an audio file, the model transcribes it into text.
      operationId: containerAudioTranscriptions
      # The request body is mandatory and is sent as multipart/form-data.
      requestBody:
        content:
          multipart/form-data:
            schema:
              $ref: '#/components/schemas/ContainerAudioTranscriptionBody'
        required: true
      responses:
        # NOTE(review): only a JSON body is declared for 200, while the request
        # schema has a `stream` flag described as producing server-sent events
        # - confirm whether a streaming response should be documented too.
        '200':
          description: Successfully transcribed the audio file.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ContainerAudioTranscriptionSuccess'
              examples:
                # In this sample, input_tokens (20) is the sum of audio_tokens
                # and text_tokens (10 + 10), and total_tokens (30) is
                # input_tokens + output_tokens.
                # NOTE(review): processed_audio_length_ms (24000) is larger
                # than input_audio_length_ms (18000) - confirm that is intended.
                Example:
                  value:
                    text: Hello, how are you?
                    usage:
                      type: tokens
                      input_tokens: 20
                      output_tokens: 10
                      total_tokens: 30
                      input_audio_length_ms: 18000
                      processed_audio_length_ms: 24000
                      input_token_details:
                        audio_tokens: 10
                        text_tokens: 10
        # No response body schema is declared for this status.
        '422':
          description: Unprocessable Entity
components:
  schemas:
    # Multipart form fields for a transcription request. Only `file` is
    # required; every other field may also be sent as null.
    ContainerAudioTranscriptionBody:
      title: ContainerAudioTranscriptionBody
      type: object
      required:
        - file
      properties:
        model:
          title: Model
          description: Routes the request to a specific adapter.
          examples:
            - (adapter-route)
          anyOf:
            - type: string
            - type: 'null'
        # The upload itself, declared as a binary string.
        file:
          title: File
          description: >-
            The audio file object (not file name) to transcribe, in one of these
            formats: mp3, wav, flac, ogg, and many other standard audio formats.
          type: string
          format: binary
        # Accepts the literal `auto`, a ServerVadChunkingStrategy object, or
        # null.
        chunking_strategy:
          title: Chunking Strategy
          description: >-
            Controls how the audio is cut into chunks. When set to `"auto"`, the
            server first normalizes loudness and then uses voice activity
            detection (VAD) to choose boundaries. `server_vad` object can be
            provided to tweak VAD detection parameters manually. If unset, the
            audio is transcribed as a single block.
          anyOf:
            - type: string
              const: auto
            - $ref: '#/components/schemas/ServerVadChunkingStrategy'
            - type: 'null'
        language:
          title: Language
          description: >-
            The language of the input audio. Supplying the input language in
            [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)
            (e.g. `en`) format will improve accuracy and latency.
          anyOf:
            - type: string
            - type: 'null'
        stream:
          title: Stream
          description: >-
            Whether to stream the transcription result. When set to `true`, the
            transcription result will be streamed as [server-sent
            events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format)
            once generated.
          anyOf:
            - type: boolean
            - type: 'null'
        # NOTE(review): the description gives a 0-1 range, but no `minimum` /
        # `maximum` is declared, so the schema itself does not enforce it -
        # confirm whether bounds should be added.
        temperature:
          title: Temperature
          description: >-
            The sampling temperature, between 0 and 1. Higher values like 0.8
            will make the output more random, while lower values like 0.2 will
            make it more focused and deterministic.
          anyOf:
            - type: number
            - type: 'null'
      # NOTE(review): OpenAPI 3.1 deprecates the schema-level `example` keyword
      # in favor of `examples` (which `model` already uses) - confirm renderer
      # support before switching.
      example:
        file: '@/path/to/file/audio.mp3'
    # Success payload of the transcription endpoint: a reference to
    # AudioTranscriptionResult that carries its own title.
    ContainerAudioTranscriptionSuccess:
      title: ContainerAudioTranscriptionSuccess
      $ref: '#/components/schemas/AudioTranscriptionResult'
    # Object form of `chunking_strategy` for tuning VAD manually. `type` is the
    # only required field and must be the literal `server_vad`; the other
    # three fields are nullable and declare defaults.
    ServerVadChunkingStrategy:
      title: ServerVadChunkingStrategy
      type: object
      required:
        - type
      properties:
        type:
          title: Type
          description: >-
            Must be set to `server_vad` to enable manual chunking using server
            side VAD.
          type: string
          const: server_vad
        prefix_padding_ms:
          title: Prefix Padding Ms
          description: >-
            Amount of audio to include before the VAD detected speech (in
            milliseconds).
          default: 300
          anyOf:
            - type: integer
            - type: 'null'
        # NOTE(review): the wording about the model "responding" and "jumping
        # in" reads like a conversational setting - confirm it describes chunk
        # splitting for transcription.
        silence_duration_ms:
          title: Silence Duration Ms
          description: >-
            Duration of silence to detect speech stop (in milliseconds). With
            shorter values the model will respond more quickly, but may jump in
            on short pauses from the user.
          default: 200
          anyOf:
            - type: integer
            - type: 'null'
        # NOTE(review): the 0.0-1.0 range appears only in the description; no
        # `minimum` / `maximum` is declared - confirm whether it is enforced.
        threshold:
          title: Threshold
          description: >-
            Sensitivity threshold (0.0 to 1.0) for voice activity detection. A
            higher threshold will require louder audio to activate the model,
            and thus might perform better in noisy environments.
          default: 0.5
          anyOf:
            - type: number
            - type: 'null'
    # Transcription response body: the recognized text plus a usage record.
    # Both fields are required.
    AudioTranscriptionResult:
      title: AudioTranscriptionResult
      type: object
      required:
        - text
        - usage
      properties:
        text:
          title: Text
          description: The transcribed text.
          type: string
        usage:
          $ref: '#/components/schemas/AudioTranscriptionUsage'
    # Token and audio-length accounting for one transcription request. Every
    # field is required except `input_token_details`, which may also be null.
    AudioTranscriptionUsage:
      title: AudioTranscriptionUsage
      type: object
      required:
        - type
        - input_tokens
        - output_tokens
        - total_tokens
        - input_audio_length_ms
        - processed_audio_length_ms
      properties:
        # Fixed to the literal `tokens` by the `const` keyword.
        type:
          title: Type
          description: The type of the usage object. Always `tokens` for this variant.
          type: string
          const: tokens
        input_tokens:
          title: Input Tokens
          description: Number of input tokens billed for this request.
          type: integer
        output_tokens:
          title: Output Tokens
          description: Number of output tokens generated.
          type: integer
        total_tokens:
          title: Total Tokens
          description: Total number of tokens used (input + output).
          type: integer
        input_audio_length_ms:
          title: Input Audio Length Ms
          description: The length of the input audio in milliseconds.
          type: integer
        processed_audio_length_ms:
          title: Processed Audio Length Ms
          description: The length of the processed audio in milliseconds.
          type: integer
        input_token_details:
          description: Details about the input tokens billed for this request.
          anyOf:
            - $ref: '#/components/schemas/AudioTranscriptionInputTokenDetails'
            - type: 'null'
    # Split of the billed input tokens into audio and text counts. Both
    # counts are required integers.
    AudioTranscriptionInputTokenDetails:
      title: AudioTranscriptionInputTokenDetails
      type: object
      required:
        - audio_tokens
        - text_tokens
      properties:
        audio_tokens:
          title: Audio Tokens
          description: Number of audio tokens billed for this request.
          type: integer
        text_tokens:
          title: Text Tokens
          description: Number of text tokens billed for this request.
          type: integer

````