Fine-tuning Orpheus (3B) TTS with Unsloth
What is a TTS model?
What You'll Need
1
2
Loading the Model
from unsloth import FastLanguageModel
import torch

# Load the Orpheus 3B TTS checkpoint through Unsloth's fast loader.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/orpheus-3b-0.1-ft",
    max_seq_length=2048,  # Sequence length - leave this as is
    dtype=None,           # Auto-detect precision
    load_in_4bit=False,   # Set to True if you're low on VRAM
)
3
Setting Up LoRA
# Attach LoRA adapters so only a small fraction of weights is trained.
model = FastLanguageModel.get_peft_model(
    model,
    r=64,  # LoRA rank - higher = better quality but slower
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=64,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",  # Unsloth's magic - saves 30% VRAM!
    random_state=3407,
)
# Expected console output, e.g.:
# "Unsloth 2026.2.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers."
Preparing Your Data (Important!)
🏁Data Format Requirements
from datasets import load_dataset

# Load the example single-speaker voice dataset from the Hub.
dataset = load_dataset("MrDragonFox/Elise", split="train")
# Or load from local files:
# dataset = load_dataset("json", data_files="your_data.json", split="train")
import torchaudio.transforms as T
from snac import SNAC

# SNAC codec (24 kHz variant) used to turn waveforms into discrete codes.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")
def tokenise_audio(waveform):
    """Encode a mono numpy waveform into a flat list of interleaved SNAC token ids.

    Relies on the module-level `snac_model` (on CUDA) and `ds_sample_rate`
    (the source dataset's sampling rate).
    """
    audio = torch.from_numpy(waveform).unsqueeze(0)
    audio = audio.to(dtype=torch.float32)
    # Resample to the 24 kHz rate the SNAC model expects.
    audio = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)(audio)
    audio = audio.unsqueeze(0).to("cuda")

    with torch.inference_mode():
        codes = snac_model.encode(audio)

    # Flatten each frame into 7 tokens. The (layer, index, slot) layout below
    # reproduces the required interleaving order exactly; each slot is shifted
    # into its own 4096-wide id range on top of the audio-vocab base.
    base = 128266
    flat_tokens = []
    for i in range(codes[0].shape[1]):
        layout = (
            (0, i,         0),
            (1, 2 * i,     1),
            (2, 4 * i,     2),
            (2, 4 * i + 1, 3),
            (1, 2 * i + 1, 4),
            (2, 4 * i + 2, 5),
            (2, 4 * i + 3, 6),
        )
        for layer, idx, slot in layout:
            flat_tokens.append(codes[layer][0][idx].item() + base + slot * 4096)
    return flat_tokens
def add_codes(example):
    """Attach SNAC codes for the row's audio as `codes_list` (None on failure)."""
    tokens = None
    try:
        answer_audio = example.get("audio")
        if answer_audio and "array" in answer_audio:
            tokens = tokenise_audio(answer_audio["array"])
    except Exception as e:
        # Best-effort: a bad row is skipped, not fatal (filtered out later).
        print(f"Skipping row due to error: {e}")
    example["codes_list"] = tokens
    return example

# Source sampling rate, read once; used by tokenise_audio for resampling.
ds_sample_rate = dataset[0]["audio"]["sampling_rate"]
# Encode every row, drop the raw audio column, then remove rows whose
# encoding failed or produced no tokens.
dataset = dataset.map(add_codes, remove_columns=["audio"])
dataset = dataset.filter(lambda row: row["codes_list"] is not None)
dataset = dataset.filter(lambda row: len(row["codes_list"]) > 0)
def remove_duplicate_frames(example):
    """Collapse consecutive frames whose first code repeats the previous frame's.

    `codes_list` is a flat sequence of 7-token frames; mutates the example
    in place and returns it. Raises ValueError if the length is not a
    multiple of 7.
    """
    vals = example["codes_list"]
    if len(vals) % 7 != 0:
        raise ValueError("Input list length must be divisible by 7")
    kept = vals[:7]
    for start in range(7, len(vals), 7):
        # NOTE(review): only the first (coarsest) code of each frame is
        # compared, so frames sharing it are treated as duplicates even if
        # the finer codes differ - presumably intentional; confirm upstream.
        if vals[start] != kept[-7]:
            kept.extend(vals[start:start + 7])
    example["codes_list"] = kept
    return example
# Apply frame de-duplication across the whole dataset.
dataset = dataset.map(remove_duplicate_frames)
# Special token ids. The base tokenizer holds 128256 entries; the six
# chat/speech markers sit contiguously right after that vocabulary.
tokeniser_length = 128256
start_of_text = 128000  # NOTE(review): presumably the tokenizer's BOS id - confirm
end_of_text = 128009    # NOTE(review): presumably the tokenizer's end-of-turn id - confirm

(start_of_speech, end_of_speech,
 start_of_human, end_of_human,
 start_of_ai, end_of_ai) = (tokeniser_length + offset for offset in range(1, 7))
def create_input_ids(example):
    """Build a training sequence: a human turn (text) followed by an AI turn (audio).

    Sets `input_ids`, `labels` (same object - full-sequence supervision) and
    `attention_mask` on the example.
    """
    # Single-speaker model. For multi-speaker, prefix the speaker name:
    # text_prompt = f"{example['source']}: {example['text']}"
    text_prompt = example['text']
    text_ids = tokenizer.encode(text_prompt, add_special_tokens=True)
    text_ids.append(end_of_text)

    # [human start] [text] [human end] [ai start] [speech start] [audio] [speech end] [ai end]
    sequence = [
        start_of_human,
        *text_ids,
        end_of_human,
        start_of_ai,
        start_of_speech,
        *example["codes_list"],
        end_of_speech,
        end_of_ai,
    ]
    example["input_ids"] = sequence
    example["labels"] = sequence
    example["attention_mask"] = [1] * len(sequence)
    return example

dataset = dataset.map(create_input_ids, remove_columns=["text", "codes_list"])
Training Time
from transformers import TrainingArguments, Trainer

trainer = Trainer(
    model=model,
    train_dataset=dataset,
    args=TrainingArguments(
        per_device_train_batch_size=1,  # Batch size - be careful with >1 on multi-GPU
        gradient_accumulation_steps=4,  # Gradient accumulation - effectively batch=4
        warmup_steps=5,
        max_steps=60,  # Quick test - for real training use num_train_epochs=1
        # num_train_epochs = 1,  # Use this for full training
        learning_rate=2e-4,
        logging_steps=1,
        optim="adamw_8bit",  # 8-bit optimizer saves memory
        weight_decay=0.001,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

# Snapshot GPU memory before training so peak usage can be reported after.
BYTES_PER_GB = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

trainer_stats = trainer.train()

used_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)
print(f"Training time: {round(trainer_stats.metrics['train_runtime']/60, 2)} minutes")
print(f"Peak memory usage: {used_memory} GB")
Testing Your Model
FastLanguageModel.for_inference(model)  # Switch to inference mode
snac_model.to("cpu")  # Move SNAC to CPU to save VRAM

# Test prompts to synthesize.
prompts = [
    "Hey there my name is Elise, and I'm a speech generation model.",
    "This is a test of the fine-tuned voice synthesis system.",
]

# None for single-speaker, or speaker name for multi-speaker.
chosen_voice = None

# Prefix each prompt with the speaker name when one is chosen.
prompts_ = [(f"{chosen_voice}: " + p) if chosen_voice else p for p in prompts]
all_input_ids = [tokenizer(p, return_tensors="pt").input_ids for p in prompts_]
# Wrap each prompt with chat markers:
# 128259 = start of human, 128009 = end of text, 128260 = end of human.
start_token = torch.tensor([[128259]], dtype=torch.int64)
end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)
all_modified_input_ids = [
    torch.cat([start_token, ids, end_tokens], dim=1) for ids in all_input_ids
]

# Left-pad every sequence to the batch maximum (pad token id 128263),
# with a matching attention mask (0 over padding, 1 over content).
max_length = max(ids.shape[1] for ids in all_modified_input_ids)
all_padded_tensors = []
all_attention_masks = []
for ids in all_modified_input_ids:
    pad = max_length - ids.shape[1]
    all_padded_tensors.append(
        torch.cat([torch.full((1, pad), 128263, dtype=torch.int64), ids], dim=1)
    )
    all_attention_masks.append(
        torch.cat(
            [torch.zeros((1, pad), dtype=torch.int64),
             torch.ones((1, ids.shape[1]), dtype=torch.int64)],
            dim=1,
        )
    )

input_ids = torch.cat(all_padded_tensors, dim=0).to("cuda")
attention_mask = torch.cat(all_attention_masks, dim=0).to("cuda")
# Generate speech tokens; 128258 is the end-of-speech id used to stop.
generated_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=1200,
    do_sample=True,
    temperature=0.6,
    top_p=0.95,
    repetition_penalty=1.1,
    num_return_sequences=1,
    eos_token_id=128258,
    use_cache=True,
)
# Extract audio tokens from the generations.
# 128257 marks the start of the audio stream; 128258 is end-of-speech.
token_to_find = 128257
token_to_remove = 128258

# Crop everything up to (and including) the LAST start-of-audio marker;
# if no marker was generated, keep the whole sequence.
token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
if len(token_indices[1]) > 0:
    last_occurrence_idx = token_indices[1][-1].item()
    cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
else:
    cropped_tensor = generated_ids

# Strip end-of-speech markers from each row.
processed_rows = [row[row != token_to_remove] for row in cropped_tensor]

# Trim each row to a whole number of 7-token frames and shift ids back to
# raw SNAC code space (the audio vocab starts at 128266).
code_lists = []
for row in processed_rows:
    usable = (row.size(0) // 7) * 7
    # Vectorized subtraction + .tolist() yields plain Python ints; the
    # original per-element comprehension built a list of 0-dim tensors
    # (same values, but slow and awkward for downstream torch.tensor()).
    code_lists.append((row[:usable] - 128266).tolist())
def redistribute_codes(code_list):
    """Split a flat 7-token frame sequence back into SNAC's three code layers
    and decode to audio with the module-level `snac_model`.

    Inverts the interleaving (and per-slot 4096 offsets) applied at encode time.
    """
    layer_1, layer_2, layer_3 = [], [], []
    for frame_start in range(0, (len(code_list) // 7) * 7, 7):
        frame = code_list[frame_start:frame_start + 7]
        layer_1.append(frame[0])
        layer_2.append(frame[1] - 4096)
        layer_3.append(frame[2] - 2 * 4096)
        layer_3.append(frame[3] - 3 * 4096)
        layer_2.append(frame[4] - 4 * 4096)
        layer_3.append(frame[5] - 5 * 4096)
        layer_3.append(frame[6] - 6 * 4096)

    # Clamp every code into the valid range [0, 4095].
    layer_1 = [min(4095, max(0, c)) for c in layer_1]
    layer_2 = [min(4095, max(0, c)) for c in layer_2]
    layer_3 = [min(4095, max(0, c)) for c in layer_3]

    codes = [
        torch.tensor(layer_1).unsqueeze(0),
        torch.tensor(layer_2).unsqueeze(0),
        torch.tensor(layer_3).unsqueeze(0),
    ]
    return snac_model.decode(codes)
# Decode each token sequence; on failure, substitute 24000 zero samples
# (one second at 24 kHz) so indices stay aligned with `prompts`.
my_samples = []
for code_list in code_lists:
    try:
        my_samples.append(redistribute_codes(code_list))
    except Exception as e:
        print(f"Error decoding audio: {e}")
        my_samples.append(torch.zeros(1, 1, 24000))
# Save and play the audio!
import scipy.io.wavfile as wavfile
from IPython.display import display, Audio
import os

# Create output directory if it doesn't exist
os.makedirs("generated_audio", exist_ok=True)

for i, samples in enumerate(my_samples):
    print(f"\n{i+1}. {prompts[i]}")
    # Convert to a CPU numpy array for scipy/IPython.
    audio_array = samples.detach().squeeze().to("cpu").numpy()
    # Save as a 24 kHz WAV file.
    output_path = f"generated_audio/output_{i+1}.wav"
    wavfile.write(output_path, 24000, audio_array)
    print(f" ✓ Saved to: {output_path}")
    # Try to display an inline audio player (works in Jupyter).
    try:
        display(Audio(audio_array, rate=24000))
    except Exception:
        # Was a bare `except:`, which would also swallow KeyboardInterrupt
        # and SystemExit; outside Jupyter we just point at the file instead.
        print(f" → Play the audio file directly: {output_path}")
❔FAQ
Q: Running out of memory?
Q: Training is super slow?
Q: Results aren't great?
Q: Multi-GPU errors?
Q: Can I train on non-English data?
Advanced Tips
Happy training! 🎉
Last updated
Was this helpful?