Why Batch Text-to-Speech?
If you need to convert large amounts of text to audio, doing it manually is painfully slow.
Lovo AI offers one of the most natural-sounding TTS APIs available. In this tutorial, we will build a Python script that batch processes text files into professional audio.
What We Are Building
A command-line tool that:
1. Takes a folder of `.txt` files
2. Converts each to speech using Lovo AI
3. Saves the audio files with proper naming
4. Optionally merges them into a single audio file
5. Generates a JSON manifest with timestamps
Prerequisites
Step 1: Project Setup
mkdir lovo-batch-tts
cd lovo-batch-tts
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
pip install requests pydub python-dotenv

Create a `.env` file:
LOVO_API_KEY=your_api_key_here

Step 2: The Lovo API Client
Create `lovo_client.py`:
import os
import time
import requests
from typing import Optional, List, Dict
from dataclasses import dataclass
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the environment.
load_dotenv()
# Base URL that every API request in this module is built on.
API_BASE = "https://api.genny.lovo.ai/api/v1"
# API key read from the environment; None when LOVO_API_KEY is not set.
API_KEY = os.getenv("LOVO_API_KEY")
@dataclass
class Voice:
    """One speaker entry from the ``/speakers`` listing."""

    # Speaker identifier; this is the value sent as "speaker" to the TTS call.
    id: str
    # Human-readable name shown when listing voices.
    display_name: str
    # Locale tag such as "en-US".
    locale: str
    # Gender label; "unknown" when the API response omits it.
    gender: str
@dataclass
class TTSResult:
    """What a finished TTS job hands back to the caller."""

    # Location of the generated audio (first entry of the job's "urls").
    audio_url: str
    # Value of the job's "duration" field; 0 when the field is absent.
    duration_ms: int
    # Value of the job's "wordTimestamps" field; empty when absent.
    word_timestamps: List[Dict]
def get_headers() -> dict:
    """Build the HTTP headers sent with every Lovo API request.

    Returns:
        A dict with the API key header and a JSON content type.

    Raises:
        RuntimeError: If ``LOVO_API_KEY`` was not found in the environment.
    """
    # Fail early with an actionable message: a None header value would
    # otherwise only show up later as an unclear request/authentication error.
    if not API_KEY:
        raise RuntimeError(
            "LOVO_API_KEY is not set; add it to your environment or .env file"
        )
    return {
        "X-API-KEY": API_KEY,
        "Content-Type": "application/json",
    }
def list_voices(locale: Optional[str] = None) -> List[Voice]:
    """Get available voices, optionally filtered by locale.

    Args:
        locale: Locale prefix to keep (for example ``"en-US"``). ``None``
            returns every voice.

    Returns:
        The matching voices, in the order the API lists them.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within 30 seconds.
    """
    response = requests.get(
        f"{API_BASE}/speakers",
        headers=get_headers(),
        timeout=30,  # without a timeout the call can block indefinitely
    )
    response.raise_for_status()
    return [
        Voice(
            id=entry["id"],
            display_name=entry["displayName"],
            locale=entry["locale"],
            gender=entry.get("gender", "unknown"),
        )
        for entry in response.json()["data"]
        if locale is None or entry["locale"].startswith(locale)
    ]
def generate_speech(
    text: str,
    voice_id: str,
    speed: float = 1.0,
    pitch: float = 1.0,
    max_attempts: int = 60,
    poll_interval: float = 1.0,
) -> TTSResult:
    """Generate speech from text using the specified voice.

    Creates a TTS job, then polls its status until it succeeds, fails, or
    the attempt budget is used up.

    Args:
        text: Text to convert.
        voice_id: Speaker id sent as ``"speaker"``.
        speed: Value sent as ``"speed"``.
        pitch: Value sent as ``"pitch"``.
        max_attempts: Maximum number of status polls.
        poll_interval: Seconds to sleep between polls. The total wait is
            longer than ``max_attempts * poll_interval`` because each poll
            request also takes time.

    Returns:
        The audio URL, duration and word timestamps of the finished job.

    Raises:
        requests.HTTPError: If any API call answers with an error status.
        RuntimeError: If the job reports ``"failed"`` or succeeds without
            an audio URL.
        TimeoutError: If the job is still unfinished after the last poll.
    """
    request_timeout = 30  # seconds per HTTP call; prevents indefinite hangs

    # Step 1: create the TTS job.
    create_response = requests.post(
        f"{API_BASE}/tts",
        headers=get_headers(),
        json={
            "speaker": voice_id,
            "text": text,
            "speed": speed,
            "pitch": pitch,
        },
        timeout=request_timeout,
    )
    create_response.raise_for_status()
    job_id = create_response.json()["id"]

    # Step 2: poll until the job reaches a terminal state.
    for _ in range(max_attempts):
        status_response = requests.get(
            f"{API_BASE}/tts/{job_id}",
            headers=get_headers(),
            timeout=request_timeout,
        )
        status_response.raise_for_status()
        data = status_response.json()
        status = data["status"]

        if status == "succeeded":
            urls = data.get("urls") or []
            if not urls:
                # Guard against an IndexError on an empty/missing URL list.
                raise RuntimeError("TTS job succeeded but returned no audio URL")
            return TTSResult(
                audio_url=urls[0],
                duration_ms=data.get("duration", 0),
                word_timestamps=data.get("wordTimestamps", []),
            )
        if status == "failed":
            raise RuntimeError(
                f"TTS generation failed: {data.get('error', 'Unknown error')}"
            )
        time.sleep(poll_interval)

    raise TimeoutError("TTS generation timed out")
def download_audio(url: str, output_path: str) -> str:
    """Download the audio file at *url* and write it to *output_path*.

    The response is streamed to disk in chunks so that large audio files
    are never held in memory in full.

    Args:
        url: Location of the audio file.
        output_path: Destination file; overwritten if it exists.

    Returns:
        ``output_path``, unchanged.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server stalls for more than 60 seconds.
    """
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(output_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    return output_path


# Step 3: The Batch Processor
Create `batch_tts.py`:
import argparse
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Dict, List, Optional

from lovo_client import download_audio, generate_speech, list_voices
@dataclass
class ProcessedFile:
    """Outcome of converting one text file; one entry in the manifest."""

    # Path of the input text file.
    source_file: str
    # Path of the written audio file; "" when conversion failed.
    audio_file: str
    # Audio duration reported by the TTS result; 0 on failure.
    duration_ms: int
    # Whitespace-separated word count of the input; 0 on failure.
    word_count: int
    # True when audio was generated and downloaded.
    success: bool
    # Failure reason; None on success. Requires `Optional` from `typing`.
    error: Optional[str] = None
def get_text_files(input_dir: str) -> List[Path]:
    """Return the ``.txt`` files directly inside *input_dir*, ordered by file name."""
    candidates = Path(input_dir).glob("*.txt")
    return sorted(candidates, key=lambda entry: entry.name)
def process_file(
    file_path: Path,
    voice_id: str,
    output_dir: Path,
    speed: float = 1.0
) -> ProcessedFile:
    """Convert one text file to an MP3 and report the outcome.

    Never raises: any error while reading, generating or downloading is
    captured in the returned record's ``error`` field with ``success=False``.
    """
    source = str(file_path)

    def failure(reason: str) -> ProcessedFile:
        # Every unsuccessful outcome shares the same zeroed record shape.
        return ProcessedFile(
            source_file=source,
            audio_file="",
            duration_ms=0,
            word_count=0,
            success=False,
            error=reason,
        )

    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            text = handle.read().strip()

        # Nothing to synthesize for blank files.
        if not text:
            return failure("Empty file")

        speech = generate_speech(text, voice_id, speed=speed)

        # The audio keeps the source file's base name, with an .mp3 suffix.
        destination = output_dir / (file_path.stem + ".mp3")
        download_audio(speech.audio_url, str(destination))

        return ProcessedFile(
            source_file=source,
            audio_file=str(destination),
            duration_ms=speech.duration_ms,
            word_count=len(text.split()),
            success=True,
        )
    except Exception as exc:
        return failure(str(exc))
def batch_process(
    input_dir: str,
    output_dir: str,
    voice_id: str,
    speed: float = 1.0,
    parallel: int = 3
) -> List[ProcessedFile]:
    """Process all text files in a directory.

    Args:
        input_dir: Directory scanned for ``.txt`` files.
        output_dir: Directory for the audio files; created if missing.
        voice_id: Voice used for every file.
        speed: Speech speed passed through to each conversion.
        parallel: Number of worker threads; values below 1 are treated as 1.

    Returns:
        One record per input file, sorted by source path.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    text_files = get_text_files(input_dir)
    print(f"Found {len(text_files)} text files to process")

    results: List[ProcessedFile] = []

    # ThreadPoolExecutor rejects max_workers <= 0, so clamp instead of crashing.
    workers = max(1, parallel)
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {
            executor.submit(
                process_file, f, voice_id, output_path, speed
            ): f for f in text_files
        }
        # Report each file as soon as it finishes, whatever the order.
        for future in as_completed(futures):
            file_path = futures[future]
            result = future.result()
            results.append(result)
            if result.success:
                print(f"Processed: {file_path.name} ({result.duration_ms}ms)")
            else:
                print(f"Failed: {file_path.name} - {result.error}")

    # Completion order is arbitrary; restore a stable order by source path.
    results.sort(key=lambda r: r.source_file)
    return results
def merge_audio_files(
    results: List[ProcessedFile],
    output_path: str,
    silence_ms: int = 500
) -> str:
    """Merge all successfully generated audio files into a single MP3.

    Args:
        results: Processing records; only entries with ``success`` set and a
            non-empty ``audio_file`` are merged, in the order given.
        output_path: Destination of the merged MP3.
        silence_ms: Length of the silent gap inserted between consecutive
            segments, in milliseconds.

    Returns:
        ``output_path``, unchanged.
    """
    from pydub import AudioSegment

    silence = AudioSegment.silent(duration=silence_ms)
    combined = AudioSegment.empty()
    merged_any = False

    for result in results:
        if not (result.success and result.audio_file):
            continue
        # Insert the gap only *between* segments, not after the last one.
        if merged_any:
            combined += silence
        combined += AudioSegment.from_mp3(result.audio_file)
        merged_any = True

    combined.export(output_path, format="mp3")
    print(f"Merged audio saved to: {output_path}")
    return output_path
def save_manifest(results: List[ProcessedFile], output_path: str):
    """Write a JSON summary of *results* (totals plus one entry per file) to *output_path*."""
    succeeded = len([entry for entry in results if entry.success])
    duration_total = sum(entry.duration_ms for entry in results)
    word_total = sum(entry.word_count for entry in results)

    manifest = {
        "total_files": len(results),
        "successful": succeeded,
        # Every record is either successful or failed.
        "failed": len(results) - succeeded,
        "total_duration_ms": duration_total,
        "total_words": word_total,
        "files": [asdict(entry) for entry in results],
    }

    serialized = json.dumps(manifest, indent=2)
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.write(serialized)

    print(f"Manifest saved to: {output_path}")
def main():
    """Command-line entry point: parse arguments, convert files, and report.

    With ``--list-voices`` the available voices are printed and nothing else
    happens. Otherwise every ``.txt`` file in ``input_dir`` is converted into
    ``output_dir``, ``manifest.json`` is written there, the audio is
    optionally merged into ``merged.mp3``, and a summary is printed.

    Raises:
        SystemExit: On invalid arguments, or when no voice can be selected.
    """
    parser = argparse.ArgumentParser(
        description="Batch convert text files to speech using Lovo AI"
    )
    # The directories are optional at the parser level so that
    # `--list-voices` works on its own; they are validated below.
    parser.add_argument("input_dir", nargs="?", help="Directory containing .txt files")
    parser.add_argument("output_dir", nargs="?", help="Directory for output audio files")
    parser.add_argument("--voice", help="Voice ID to use (default: first en-US voice)")
    parser.add_argument("--speed", type=float, default=1.0, help="Speech speed (0.5-2.0)")
    parser.add_argument("--parallel", type=int, default=3, help="Parallel workers")
    parser.add_argument("--merge", action="store_true", help="Merge all audio into one file")
    parser.add_argument("--list-voices", action="store_true", help="List available voices and exit")
    args = parser.parse_args()

    # List voices mode
    if args.list_voices:
        print("Available voices:")
        print("-" * 60)
        for voice in list_voices():
            print(f"{voice.id}: {voice.display_name} ({voice.locale}, {voice.gender})")
        return

    if args.input_dir is None or args.output_dir is None:
        parser.error("the following arguments are required: input_dir, output_dir")
    if not Path(args.input_dir).is_dir():
        parser.error(f"input_dir is not a directory: {args.input_dir}")

    # Get voice ID: prefer en-US, fall back to any available voice.
    voice_id = args.voice
    if not voice_id:
        voices = list_voices("en-US") or list_voices()
        if not voices:
            # Avoid an IndexError when the API returns no voices at all.
            raise SystemExit("No voices available; pass one explicitly with --voice")
        voice_id = voices[0].id
        print(f"Using voice: {voices[0].display_name}")

    # Process files
    results = batch_process(
        args.input_dir,
        args.output_dir,
        voice_id,
        speed=args.speed,
        parallel=args.parallel,
    )

    # Save manifest
    manifest_path = Path(args.output_dir) / "manifest.json"
    save_manifest(results, str(manifest_path))

    # Optionally merge
    if args.merge:
        merged_path = Path(args.output_dir) / "merged.mp3"
        merge_audio_files(results, str(merged_path))

    # Summary
    succeeded = sum(1 for r in results if r.success)
    total_ms = sum(r.duration_ms for r in results)
    minutes, seconds = divmod(total_ms // 1000, 60)
    print("\n" + "=" * 60)
    print(f"Processed: {len(results)} files")
    print(f"Successful: {succeeded}")
    print(f"Failed: {len(results) - succeeded}")
    print(f"Total duration: {minutes}m {seconds}s")
if __name__ == "__main__":
    main()

Step 4: Usage Examples
List Available Voices
python batch_tts.py --list-voices

Output:
Available voices:
------------------------------------------------------------
voice_abc123: James (en-US, male)
voice_def456: Sarah (en-US, female)
voice_ghi789: Emma (en-GB, female)
...

Convert a Folder of Text Files
python batch_tts.py ./chapters ./audiobook --voice voice_abc123

Convert with Custom Speed and Merge
python batch_tts.py ./scripts ./audio --speed 0.9 --merge --parallel 5

Example Input Structure
chapters/
01-introduction.txt
02-getting-started.txt
03-advanced-topics.txt
04-conclusion.txt

Example Output
audiobook/
01-introduction.mp3
02-getting-started.mp3
03-advanced-topics.mp3
04-conclusion.mp3
merged.mp3
manifest.json

Step 5: The Manifest File
The script generates a `manifest.json` with metadata:
{
"total_files": 4,
"successful": 4,
"failed": 0,
"total_duration_ms": 342500,
"total_words": 2847,
"files": [
{
"source_file": "chapters/01-introduction.txt",
"audio_file": "audiobook/01-introduction.mp3",
"duration_ms": 85200,
"word_count": 712,
"success": true,
"error": null
}
]
}

This is useful for:
Advanced: Adding SSML Support
For more control over pronunciation, add SSML support:
def text_to_ssml(text: str) -> str:
    """Convert plain text to SSML with basic enhancements.

    The text is XML-escaped, double-quoted passages are wrapped in a
    moderate ``<emphasis>``, a 300 ms ``<break>`` is inserted after every
    ``". "``, and the result is wrapped in ``<speak>``.

    Args:
        text: Plain text (not already SSML).

    Returns:
        The SSML document as a string.
    """
    import re
    from xml.sax.saxutils import escape

    # Escape &, < and > first so literal characters cannot break the markup.
    # Double quotes are left alone because the emphasis rule matches on them.
    text = escape(text)

    # Emphasis must run BEFORE breaks are inserted: the break tag itself
    # contains a quoted attribute ("300ms") that the pattern would match.
    text = re.sub(
        r'"([^"]+)"',
        r'<emphasis level="moderate">"\1"</emphasis>',
        text
    )

    # Add pauses after periods
    text = text.replace(". ", '. <break time="300ms"/> ')

    # Wrap in SSML tags
    return f'<speak>{text}</speak>'


# Cost Estimation
Lovo AI pricing is based on character count:
| Content | Characters | Estimated Cost |
|---|---|---|
| Blog post (1000 words) | ~6,000 | ~$0.30 |
| Book chapter (5000 words) | ~30,000 | ~$1.50 |
| Full audiobook (50,000 words) | ~300,000 | ~$15.00 |
Compare to professional voice actors: $200-500 per finished hour.
Troubleshooting
Rate Limiting
If you hit rate limits, reduce parallel workers:
python batch_tts.py ./input ./output --parallel 1

Large Files
For very long text files, split them first:
def split_text(text: str, max_chars: int = 5000) -> List[str]:
    """Split text into chunks at sentence boundaries.

    Newlines are treated as spaces and sentences are delimited by ``". "``.
    Consecutive sentences are packed into a chunk as long as the chunk stays
    within *max_chars*.

    Args:
        text: Text to split.
        max_chars: Maximum chunk length. A single sentence longer than this
            is not broken up; it becomes a chunk of its own.

    Returns:
        The chunks in reading order; an empty list for blank input.
    """
    pieces = text.replace("\n", " ").split(". ")

    # split() consumed the ". " separators: give the period back to every
    # piece except the last, which keeps whatever ending it already had
    # (this avoids producing "end.." or inventing a final period).
    sentences = [piece + "." for piece in pieces[:-1]]
    sentences.append(pieces[-1])

    chunks = []
    current = ""
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        candidate = f"{current} {sentence}" if current else sentence
        if len(candidate) <= max_chars:
            current = candidate
        else:
            # Only flush a non-empty chunk, so an oversized first sentence
            # does not emit an empty string.
            if current:
                chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks


# Conclusion
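You now have a Python tool for batch text-to-speech conversion. The script handles voice selection, parallel conversion of a folder of text files, optional merging into a single audio file, and generation of a JSON manifest.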
Use it for audiobooks, course content, video narration, or any project that needs natural-sounding voiceovers at scale.
Get your Lovo AI API key: genny.lovo.ai