edge-tts批量语音生成:大规模语音数据集创建自动化方案
在人工智能语音技术快速发展的今天,高质量语音数据集的需求日益增长。然而,传统语音数据采集面临诸多挑战:- **成本高昂**:专业录音设备和配音人员费用不菲- **时间消耗**:人工录制和后期处理耗时漫长- **一致性差**:不同录音环境和发音人导致质量参差不齐- **规模限制**:难以快速生成大规模多样化数据。edge-tts作为微软Edge在线语音服务的Python封装,为解决这些痛点提供了革命性的自动化方案。
·
edge-tts批量语音生成:大规模语音数据集创建自动化方案
引言:语音数据集创建的痛点与挑战
在人工智能语音技术快速发展的今天,高质量语音数据集的需求日益增长。然而,传统语音数据采集面临诸多挑战:
- 成本高昂:专业录音设备和配音人员费用不菲
- 时间消耗:人工录制和后期处理耗时漫长
- 一致性差:不同录音环境和发音人导致质量参差不齐
- 规模限制:难以快速生成大规模多样化数据
edge-tts作为微软Edge在线语音服务的Python封装,为解决这些痛点提供了革命性的自动化方案。本文将深入探讨如何利用edge-tts实现大规模语音数据集的批量生成。
edge-tts核心功能解析
技术架构概览
关键特性优势
| 特性 | 优势 | 应用场景 |
|---|---|---|
| 无需API密钥 | 零成本使用 | 学术研究、个人项目 |
| 多语言支持 | 全球语言覆盖 | 多语言语音合成 |
| 高质量音频 | 24kHz 48kbps MP3 | 专业语音数据集 |
| 实时字幕生成 | 自动时间戳标注 | 语音识别训练 |
| 参数可配置 | 音调、语速、音量调节 | 多样化数据生成 |
批量语音生成系统设计
系统架构设计
# Core class design for the batch speech-generation system.
class BatchTTSGenerator:
    """Batch generator for large-scale speech datasets.

    Wraps edge-tts so that a corpus of text items can be synthesized
    concurrently under a shared voice configuration.
    """

    def __init__(self, config: Dict):
        """Store the configuration and ensure the output directory exists.

        Args:
            config: Must contain 'output_dir', the directory that will
                receive the generated .mp3/.srt files.
        """
        self.config = config
        self.voice_manager = None  # populated lazily by initialize_voices()
        self.output_dir = Path(config['output_dir'])
        self.output_dir.mkdir(exist_ok=True)

    async def initialize_voices(self):
        """Fetch the available-voice catalogue from the edge-tts service."""
        self.voice_manager = await VoicesManager.create()

    def load_text_corpus(self, corpus_path: str) -> List[Dict]:
        """Load a text corpus from disk.

        Intended to support TXT, JSON and CSV inputs.
        """
        # TODO: implement TXT / JSON / CSV parsing (see MassTTSDatasetCreator
        # later in this article for a complete implementation).
        pass

    async def generate_batch(self, texts: List[Dict],
                             voice_config: Dict) -> List[Path]:
        """Generate speech files for every text item concurrently.

        Items that raise are dropped from the result; the rest of the
        batch still completes.

        Returns:
            Paths of the successfully generated audio files.
        """
        tasks = [self._generate_single(item, voice_config) for item in texts]
        # return_exceptions=True keeps one failure from cancelling the batch.
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return [r for r in results if isinstance(r, Path)]

    async def _generate_single(self, text_item: Dict,
                               voice_config: Dict) -> Path:
        """Synthesize one text item and return the audio file path."""
        communicate = Communicate(
            text=text_item['content'],
            voice=voice_config['voice'],
            rate=voice_config.get('rate', '+0%'),
            volume=voice_config.get('volume', '+0%'),
            pitch=voice_config.get('pitch', '+0Hz')
        )
        output_path = self.output_dir / f"{text_item['id']}.mp3"
        subtitle_path = self.output_dir / f"{text_item['id']}.srt"
        # NOTE(review): Communicate.save()'s second argument writes word-boundary
        # metadata, not necessarily SRT — confirm against the installed edge-tts
        # version (older versions used SubMaker for subtitle files).
        await communicate.save(str(output_path), str(subtitle_path))
        return output_path
并发处理优化策略
实战:万级语音数据集创建
环境配置与安装
# Create a virtual environment
python -m venv tts-env
source tts-env/bin/activate
# Install edge-tts
pip install edge-tts
# Install extra dependencies.
# Note: asyncio is part of the Python standard library and must NOT be
# pip-installed — the PyPI "asyncio" package is an obsolete Python 3.3
# backport that shadows the stdlib module and breaks modern interpreters.
pip install aiohttp pandas tqdm
完整批量生成脚本
#!/usr/bin/env python3
"""
大规模语音数据集批量生成脚本
支持万级别文本的自动化语音合成
"""
import asyncio
import csv
import json
from datetime import date
from pathlib import Path
from typing import Any, Dict, List

import aiohttp
from tqdm import tqdm

import edge_tts
from edge_tts import Communicate, VoicesManager
class MassTTSDatasetCreator:
    """Large-scale TTS dataset creator built on edge-tts.

    Lays out an output tree (audio/, subtitles/, metadata/), loads a text
    corpus, fans out synthesis tasks with bounded concurrency and writes a
    JSON manifest describing the generated samples.
    """

    def __init__(self, output_base: str = "./tts_dataset"):
        """Create the output directory tree under *output_base*."""
        self.output_base = Path(output_base)
        self.output_base.mkdir(exist_ok=True)
        # One sub-directory per artifact kind.
        self.audio_dir = self.output_base / "audio"
        self.subtitle_dir = self.output_base / "subtitles"
        self.metadata_dir = self.output_base / "metadata"
        for dir_path in (self.audio_dir, self.subtitle_dir, self.metadata_dir):
            dir_path.mkdir(exist_ok=True)

    async def get_available_voices(self) -> List[Dict]:
        """Return every voice offered by the edge-tts service."""
        voices = await edge_tts.list_voices()
        return voices

    def load_text_corpus(self, corpus_path: str) -> List[Dict]:
        """Load a text corpus from a JSON, CSV or TXT file.

        JSON files must contain a list of dicts; CSV rows become dicts keyed
        by the header row; TXT files yield one item per non-blank line with
        ids of the form "text_<line-index>".

        Raises:
            ValueError: If the file extension is not .json, .csv or .txt.
        """
        path = Path(corpus_path)
        if path.suffix == '.json':
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)
        elif path.suffix == '.csv':
            with open(path, 'r', encoding='utf-8') as f:
                return list(csv.DictReader(f))
        elif path.suffix == '.txt':
            with open(path, 'r', encoding='utf-8') as f:
                # The line index (not a running counter) is embedded in the
                # id, so blank lines leave gaps: text_0, text_2, ...
                return [{"id": f"text_{i}", "content": line.strip()}
                        for i, line in enumerate(f) if line.strip()]
        else:
            raise ValueError("Unsupported corpus format")

    async def generate_dataset(self, corpus_path: str,
                               voice_configs: List[Dict],
                               batch_size: int = 100,
                               max_concurrent: int = 10) -> Dict[str, Any]:
        """Generate the full dataset: every text under every voice config.

        Args:
            corpus_path: Path to the text corpus file.
            voice_configs: Voice configurations; each needs a 'voice' key
                and may set 'rate' / 'volume' / 'pitch'.
            batch_size: Accepted for API compatibility but currently unused —
                all tasks are created up front and throttled by the semaphore.
            max_concurrent: Maximum number of in-flight synthesis calls.

        Returns:
            Counters: total_tasks, successful, failed, generated_files.
        """
        texts = self.load_text_corpus(corpus_path)
        total_texts = len(texts)
        print(f"加载 {total_texts} 条文本,使用 {len(voice_configs)} 种语音配置")
        # The semaphore bounds concurrency against the online service.
        semaphore = asyncio.Semaphore(max_concurrent)
        results = {
            "total_tasks": total_texts * len(voice_configs),
            "successful": 0,
            "failed": 0,
            "generated_files": 0
        }
        pbar = tqdm(total=results["total_tasks"],
                    desc="生成语音数据", unit="task")

        async def process_item(text_item: Dict, voice_config: Dict):
            """Synthesize one (text, voice) pair; updates the shared counters."""
            # Resolved before the try block so the except clause can always
            # report them: the original looked these up inside the try, so a
            # KeyError there made the handler itself crash with
            # UnboundLocalError, which escaped the plain gather() below and
            # aborted the whole run.
            text_id = text_item.get('id', '<no id>')
            voice_name = voice_config.get('voice', '<no voice>')
            async with semaphore:
                try:
                    # Unique file name per (text, voice) combination.
                    base_name = f"{text_id}_{voice_name}"
                    audio_file = self.audio_dir / f"{base_name}.mp3"
                    subtitle_file = self.subtitle_dir / f"{base_name}.srt"
                    communicate = Communicate(
                        text=text_item['content'],
                        voice=voice_name,
                        rate=voice_config.get('rate', '+0%'),
                        volume=voice_config.get('volume', '+0%'),
                        pitch=voice_config.get('pitch', '+0Hz')
                    )
                    await communicate.save(
                        str(audio_file),
                        str(subtitle_file)
                    )
                    results["successful"] += 1
                    results["generated_files"] += 2  # audio + subtitle
                except Exception as e:
                    results["failed"] += 1
                    print(f"处理失败: {text_id} - {voice_name}: {str(e)}")
                finally:
                    pbar.update(1)

        # Full cartesian product of texts x voices.
        tasks = [process_item(text_item, voice_config)
                 for text_item in texts
                 for voice_config in voice_configs]
        # Exceptions are handled inside process_item, so a plain gather is safe.
        await asyncio.gather(*tasks)
        pbar.close()
        return results

    def create_dataset_manifest(self):
        """Scan generated audio files and write metadata/dataset_manifest.json.

        Returns:
            The manifest dict that was written to disk.
        """
        manifest = {
            "dataset_info": {
                "name": "Edge-TTS Generated Dataset",
                "version": "1.0",
                # Stamped with the actual creation date (the original
                # hard-coded a fixed date string).
                "created_date": date.today().isoformat(),
                "total_samples": 0,
                "languages": [],  # NOTE(review): never populated — confirm intent
                "voices": []
            },
            "samples": []
        }
        # File names are "<text_id>_<voice_name>.mp3"; voice names may contain
        # underscores themselves, so only the first segment is the text id.
        for audio_file in sorted(self.audio_dir.glob("*.mp3")):
            parts = audio_file.stem.split('_')
            if len(parts) < 2:
                continue  # not a file this tool produced
            text_id = parts[0]
            voice_name = '_'.join(parts[1:])
            subtitle_file = self.subtitle_dir / f"{audio_file.stem}.srt"
            manifest["samples"].append({
                "text_id": text_id,
                "voice": voice_name,
                "audio_path": str(audio_file.relative_to(self.output_base)),
                "subtitle_path": (str(subtitle_file.relative_to(self.output_base))
                                  if subtitle_file.exists() else None)
            })
        manifest["dataset_info"]["total_samples"] = len(manifest["samples"])
        # sorted() gives a deterministic manifest across runs.
        manifest["dataset_info"]["voices"] = sorted(
            {sample["voice"] for sample in manifest["samples"]}
        )
        manifest_file = self.metadata_dir / "dataset_manifest.json"
        with open(manifest_file, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, ensure_ascii=False, indent=2)
        return manifest
async def main():
    """Demonstrate large-scale speech-dataset generation end to end."""
    # Initialize the generator (creates the output tree).
    creator = MassTTSDatasetCreator("./large_tts_dataset")
    # Example voice configurations spanning several locales.
    voice_configs = [
        {"voice": "en-US-AriaNeural", "rate": "+0%", "volume": "+0%"},
        {"voice": "en-GB-SoniaNeural", "rate": "+10%", "volume": "-5%"},
        {"voice": "zh-CN-XiaoxiaoNeural", "rate": "+0%", "volume": "+0%"},
        {"voice": "ja-JP-NanamiNeural", "rate": "-5%", "volume": "+0%"}
    ]
    # Build a small sample corpus on disk so the demo is self-contained.
    sample_texts = [
        {"id": "sample_1", "content": "Hello, this is a test sentence for TTS generation."},
        {"id": "sample_2", "content": "语音合成技术正在改变人机交互的方式。"},
        {"id": "sample_3", "content": "人工智能は現代社会に革命をもたらしています。"}
    ]
    with open("sample_corpus.json", 'w', encoding='utf-8') as f:
        json.dump(sample_texts, f, ensure_ascii=False, indent=2)
    # Generate the dataset (network calls to the edge-tts service).
    results = await creator.generate_dataset(
        "sample_corpus.json",
        voice_configs,
        max_concurrent=5
    )
    print(f"\n生成完成!")
    print(f"成功: {results['successful']}")
    print(f"失败: {results['failed']}")
    print(f"生成文件: {results['generated_files']}")
    # Write the dataset manifest and report what it contains.
    manifest = creator.create_dataset_manifest()
    print(f"数据集包含 {manifest['dataset_info']['total_samples']} 个样本")
    print(f"支持语音: {', '.join(manifest['dataset_info']['voices'])}")


if __name__ == "__main__":
    asyncio.run(main())
性能优化与错误处理
质量保障与验证体系
自动化质量检测
火山引擎开发者社区是火山引擎打造的AI技术生态平台,聚焦Agent与大模型开发,提供豆包系列模型(图像/视频/视觉)、智能分析与会话工具,并配套评测集、动手实验室及行业案例库。社区通过技术沙龙、挑战赛等活动促进开发者成长,新用户可领50万Tokens权益,助力构建智能应用。
更多推荐
所有评论(0)