Except when using the SDK (which returns only the prediction), the result always has the following format:

{
  "prediction": <awaited result>,
  "prediction_raw": <raw result>
}

The awaited result is the output you expect when calling an endpoint. This is the data you will need in 99% of cases.

The raw result contains all the data related to the transcription process. It can include metadata, extra data, and/or the direct output from the model, unmodified by us.

For example, the output of audio transcription using the example file is:

{
  "prediction": [
    {
      "time_begin": 0.09,
      "time_end": 2.07,
      "transcription": "Split infinity",
      "language": "en",
      "probability": 0.49,
      "speaker": "not_activated",
      "channel": "channel_0"
    },
    {
      "time_begin": 2.13,
      "time_end": 5.19,
      "transcription": "in a time when less is more",
      "language": "en",
      "probability": 0.65,
      "speaker": "not_activated",
      "channel": "channel_0"
    },
    {
      "time_begin": 5.52,
      "time_end": 20.4,
      "transcription": "Where too much is never enough, there is always hope for the future. The future can be read from the past. The past foreshadows the present, and the present hasn't been written yet",
      "language": "en",
      "probability": 0.75,
      "speaker": "not_activated",
      "channel": "channel_0"
    }
  ],
  "prediction_raw": {
    "metadata": {
      "total_speech_duration": 19.919999999999998,
      "total_speech_duration_channel_0": 19.919999999999998,
      "audio_conversion_time": 0.34188103675842285,
      "vad_time": 0.0009639263153076172,
      "inference_time": 1.6644258499145508,
      "diarization_time": 0.00000476837158203125,
      "total_transcription_time": 2.0072755813598633,
      "original_file_type": "audio",
      "original_nb_channels": 1,
      "original_sample_rate": 44100,
      "original_sample_width": 16,
      "original_nb_silent_channels": 0,
      "original_nb_similar_channels": 0,
      "original_mediainfo": {
        "index": "0",
        "codec_name": "pcm_s16le",
        "codec_long_name": "PCM signed 16-bit little-endian",
        "profile": "unknown",
        "codec_type": "audio",
        "codec_time_base": "1/44100",
        "codec_tag_string": "[1][0][0][0]",
        "codec_tag": "0x0001",
        "sample_fmt": "s16",
        "sample_rate": "44100",
        "channels": "1",
        "channel_layout": "unknown",
        "bits_per_sample": "16",
        "id": "N/A",
        "r_frame_rate": "0/0",
        "avg_frame_rate": "0/0",
        "time_base": "1/44100",
        "start_pts": "N/A",
        "start_time": "N/A",
        "duration_ts": "906496",
        "duration": "20.555465",
        "bit_rate": "705855",
        "max_bit_rate": "N/A",
        "bits_per_raw_sample": "N/A",
        "nb_frames": "N/A",
        "nb_read_frames": "N/A",
        "nb_read_packets": "N/A",
        "DISPOSITION": {
          "default": "0",
          "dub": "0",
          "original": "0",
          "comment": "0",
          "lyrics": "0",
          "karaoke": "0",
          "forced": "0",
          "hearing_impaired": "0",
          "visual_impaired": "0",
          "clean_effects": "0",
          "attached_pic": "0",
          "timed_thumbnails": "0"
        },
        "nb_streams": "1",
        "nb_programs": "0",
        "format_name": "wav",
        "format_long_name": "WAV / WAVE (Waveform Audio)",
        "size": "1813648",
        "probe_score": "99",
        "TAG": {
          "encoded_by": "tracktion",
          "date": "2009-08-02",
          "creation_time": "10:48:43",
          "time_reference": "2436484",
          "coding_history": ""
        }
      }
    },
    "transcription": [
      {
        "time_begin": 0.09,
        "time_end": 2.07,
        "transcription": "Split infinity",
        "language": "en",
        "probability": 0.49,
        "speaker": "not_activated",
        "channel": "channel_0"
      },
      {
        "time_begin": 2.13,
        "time_end": 5.19,
        "transcription": "in a time when less is more",
        "language": "en",
        "probability": 0.65,
        "speaker": "not_activated",
        "channel": "channel_0"
      },
      {
        "time_begin": 5.52,
        "time_end": 20.4,
        "transcription": "Where too much is never enough, there is always hope for the future. The future can be read from the past. The past foreshadows the present, and the present hasn't been written yet",
        "language": "en",
        "probability": 0.75,
        "speaker": "not_activated",
        "channel": "channel_0"
      }
    ]
  }
}