Skip to content

Audio

This quickstart walks you through generating your first audio with Siraya AI.

Text-to-speech creation

Generates audio from the input text.

# Generate speech for the given text and write the binary MP3 response
# to speech.mp3 (-d sends a JSON body, so the explicit Content-Type is required).
curl https://audio.siraya.pro/v1/audio/speech \
    -H "Content-Type: application/json" \
    -H "Authorization: <API_KEY>" \
    -d '{
    "model": "openai/gpt-4o-mini-tts",
    "input": "A cute baby sea otter",
    "voice": "alloy"
  }' \
  --output speech.mp3

"""Generate speech from text via the /v1/audio/speech endpoint and save it as MP3."""
import os

import requests

API_URL = "https://audio.siraya.pro/v1/audio/speech"
API_KEY = os.getenv("ONEROUTER_API_KEY")

if not API_KEY:
    raise RuntimeError("Please set the ONEROUTER_API_KEY")

# Request body: model to use, text to synthesize, voice, and output container.
payload = {
    "model": "openai/gpt-4o-mini-tts",
    "input": "A cute baby sea otter.",
    "voice": "alloy",
    "response_format": "mp3"
}

headers = {
    "Authorization": API_KEY,
    "Content-Type": "application/json"
}

# json= serializes the payload for us; an explicit timeout keeps the script
# from hanging forever on a stalled connection.
response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
response.raise_for_status()

# Write the raw audio bytes next to this script.
out_path = os.path.join(os.path.dirname(__file__), "tts-output.mp3")
with open(out_path, "wb") as f:
    f.write(response.content)

print(f"Saved to: {out_path}")
  • <API_KEY> is your API Key, generated on the API page.
  • model is the model name, such as gpt-4o-mini-tts; the list of available models can be accessed on the Model page.
  • The voice to use when generating the audio. Supported voices are alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, and verse.

Example response

Info

The audio file content.

Speech-to-text translation

Translates audio into English.

# Translate an audio file into English text (multipart upload).
# Do NOT set Content-Type manually: curl generates the multipart/form-data
# header itself, including the required boundary parameter, when -F/--form
# is used. Quotes inside the --form value would be sent literally, so the
# model name is passed unquoted.
curl https://audio.siraya.pro/v1/audio/translations \
    -H "Authorization: <API_KEY>" \
    --form 'file=@/path/to/file/speech.m4a' \
    --form 'model=openai/whisper-1'
"""Translate an audio file into English using the OpenAI-compatible client."""
from openai import OpenAI

client = OpenAI(
    api_key="<API_KEY>", # Replace with your Key "sk-***"
    base_url="https://audio.siraya.pro/v1"
)

# Open the audio inside a context manager so the handle is closed
# even if the request raises.
with open("/path/to/file/speech.m4a", "rb") as audio_file:
    transcript = client.audio.translations.create(
        model="openai/whisper-1",
        file=audio_file
    )

print(transcript)
"""Translate an audio file into English via the /v1/audio/translations endpoint."""
import os

import requests

API_URL = "https://audio.siraya.pro/v1/audio/translations"
API_KEY = os.getenv("ONEROUTER_API_KEY")

if not API_KEY:
    raise RuntimeError("Please set the ONEROUTER_API_KEY")

# Reuse the MP3 produced by the text-to-speech example in this directory.
file_path = os.path.join(os.path.dirname(__file__), "tts-output.mp3")

with open(file_path, "rb") as f:
    # requests builds the multipart/form-data body (and its boundary) from
    # `files`, so only the Authorization header is supplied explicitly.
    files = {"file": ("tts-output.mp3", f, "audio/mpeg")}
    data = {
        "model": "openai/whisper-1",
        "prompt": "loudly",
        "response_format": "json",
        "temperature": 0.8,
    }
    headers = {"Authorization": API_KEY}

    # An explicit timeout keeps the script from hanging forever on a
    # stalled connection; uploads can be slow, so allow a generous window.
    response = requests.post(
        API_URL, headers=headers, files=files, data=data, timeout=120
    )
    response.raise_for_status()
    print(response.json())
  • <API_KEY> is your API Key, generated on the API page.
  • model is the model name, such as whisper-1; the list of available models can be accessed on the Model page.
  • file is the audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.

Example response

{
  "text": "Hello, my name is Wolfgang and I come from Germany. Where are you heading today?"
}

Speech-to-text transcription

Transcribes audio into the input language.

# Transcribe an audio file in its original language (multipart upload).
# The endpoint is /v1/audio/transcriptions — matching the Python example —
# not /v1/speech/transcriptions. Do NOT set Content-Type manually: curl
# generates the multipart/form-data header (with boundary) itself when
# -F/--form is used, and quotes inside a --form value are sent literally.
curl https://audio.siraya.pro/v1/audio/transcriptions \
    -H "Authorization: <API_KEY>" \
    --form 'file=@/path/to/file/speech.m4a' \
    --form 'model=openai/gpt-4o-transcribe'

"""Transcribe an audio file in its original language using the OpenAI-compatible client."""
from openai import OpenAI

client = OpenAI(
    api_key="<API_KEY>", # Replace with your Key "sk-***"
    base_url="https://audio.siraya.pro/v1"
)

# Open the audio inside a context manager so the handle is closed
# even if the request raises.
with open("/path/to/file/speech.m4a", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="openai/gpt-4o-transcribe",
        file=audio_file
    )

print(transcription)
"""Transcribe an audio file via the /v1/audio/transcriptions endpoint."""
import os

import requests

API_URL = "https://audio.siraya.pro/v1/audio/transcriptions"
API_KEY = os.getenv("ONEROUTER_API_KEY")

if not API_KEY:
    raise RuntimeError("Please set the ONEROUTER_API_KEY")

# Reuse the MP3 produced by the text-to-speech example in this directory.
file_path = os.path.join(os.path.dirname(__file__), "tts-output.mp3")

with open(file_path, "rb") as f:
    # requests builds the multipart/form-data body (and its boundary) from
    # `files`, so only the Authorization header is supplied explicitly.
    files = {"file": ("tts-output.mp3", f, "audio/mpeg")}
    data = {
        "model": "openai/gpt-4o-transcribe",
        "prompt": "loudly",
        "response_format": "json",
        "temperature": 0.8,
    }
    headers = {"Authorization": API_KEY}

    # An explicit timeout keeps the script from hanging forever on a
    # stalled connection; uploads can be slow, so allow a generous window.
    response = requests.post(
        API_URL, headers=headers, files=files, data=data, timeout=120
    )
    response.raise_for_status()
    print(response.json())
  • <API_KEY> is your API Key, generated on the API page.
  • model is the model name, such as gpt-4o-transcribe; the list of available models can be accessed on the Model page.
  • file is the audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.

Example response

{
  "text": "Imagine the wildest idea that you've ever had, and you're curious about how it might scale to something that's a 100, a 1,000 times bigger. This is a place where you can get to do that.",
  "usage": {
    "type": "tokens",
    "input_tokens": 14,
    "input_token_details": {
      "text_tokens": 0,
      "audio_tokens": 14
    },
    "output_tokens": 45,
    "total_tokens": 59
  }
}