MARS6 is a frontier text-to-speech model by CAMB.AI with voice/prosody cloning capabilities in 10 languages. MARS6 must be licensed for commercial use; we can help!
Deploy MARS6 behind an API endpoint in seconds.
Example usage
This model requires at least four inputs:
- text: The input text that needs to be spoken
- audio_ref: An audio file containing the audio of a single person
- ref_text: What is spoken in audio_ref
- language: The language code for the target language
The model will try to output an audio file containing the speech in the reference audio's style. The output is a base64-encoded string, so it must be decoded into an audio format before it can be played.
# Request payload for the MARS6 endpoint.
# `encoded_str` (base64-encoded reference audio) and `prompt_txt` (transcript of
# the reference audio, or None) are defined in the full example script.
data = {
    "text": "The quick brown fox jumps over the lazy dog",
    "audio_ref": encoded_str,
    "ref_text": prompt_txt,
    "language": 'en-us',  # Target language, in this case English.
    # "top_p": 0.7,  # Optionally specify a top_p (default 0.7)
    # "temperature": 0.7,  # Optionally specify a temperature (default 0.7)
    # "chunk_length": 200,  # Optional text chunk length for splitting long pieces of input text. Default 200
    # "max_new_tokens": 0,  # Optional limit on max number of new tokens, default is zero (unlimited)
    # "repetition_penalty": 1.5  # Optional rep penalty, default 1.5
}
# Example client: clone the voice in a reference audio file by calling a
# deployed MARS6 endpoint, then decode the base64 result back into audio.
import httpx
import base64
import time
import torchaudio
import IPython.display as ipd
import librosa, librosa.display
import torch
import io

# Step 1: set endpoint url and api key:
# NOTE(review): the `url` line was missing from the extracted listing; this
# placeholder is reconstructed -- replace it with your deployment's endpoint.
url = "<YOUR PREDICTION ENDPOINT>"
headers = {"Authorization": "Api-Key <YOUR API KEY>"}

# Step 2: pick reference audio to clone, encode it as base64
file_path = 'ref_debug.flac'  # any valid audio filepath, ideally between 6s-90s.
# NOTE(review): duration=5 loads only the first 5 seconds, which is shorter
# than the 6s-90s suggested above -- confirm the intended clip length.
wav, sr = librosa.load(file_path, sr=None, mono=True, offset=0, duration=5)
# Re-encode the mono clip as WAV into an in-memory buffer; `[None]` adds a
# leading axis to the 1-D mono array before it is handed to torchaudio.save.
io_data = io.BytesIO()
torchaudio.save(io_data, torch.from_numpy(wav)[None], sample_rate=sr, format='wav')
io_data.seek(0)  # rewind so read() returns the bytes that were just written
encoded_data = base64.b64encode(io_data.read())
encoded_str = encoded_data.decode("utf-8")
# OPTIONAL: specify the transcript of the reference/prompt (slightly speeds up inference, and may make it sound a bit better).
prompt_txt = None  # if unspecified, can be left as None

# Step 3: define other inference settings:
data = {
    "text": "The quick brown fox jumps over the lazy dog",
    "audio_ref": encoded_str,
    "ref_text": prompt_txt,
    "language": 'en-us',  # Target language, in this case English.
    # "top_p": 0.7,  # Optionally specify a top_p (default 0.7)
    # "temperature": 0.7,  # Optionally specify a temperature (default 0.7)
    # "chunk_length": 200,  # Optional text chunk length for splitting long pieces of input text. Default 200
    # "max_new_tokens": 0,  # Optional limit on max number of new tokens, default is zero (unlimited)
    # "repetition_penalty": 1.5  # Optional rep penalty, default 1.5
}

# Step 4: Send the POST request (note the first request might be a bit slow, but following requests should be fast)
st = time.time()
response = httpx.post(url, headers=headers, json=data, timeout=120)
et = time.time()

print(f"Runtime: {et-st:.2f} seconds")
# Check the response status code
if response.status_code == 200: print("Request successful!")
else: print("Request failed with status code", response.status_code, response.content)

# Step 5: decode base64 output back to audio
# The decoded bytes are wrapped in BytesIO so they can be read like a file.
wav, sr = torchaudio.load(io.BytesIO(base64.b64decode(response.json()['result'])))
# NOTE(review): reconstructed from the surviving `rate=sr)` fragment --
# presumably plays the result inline in a notebook; confirm the first argument.
ipd.Audio(wav.numpy(), rate=sr)
{
  "result": "iVBORw0KGgoAAAANSUhEU"
}