edge-tts批量语音生成:大规模语音数据集创建自动化方案

【免费下载链接】edge-tts:Use Microsoft Edge's online text-to-speech service from Python WITHOUT needing Microsoft Edge or Windows or an API key。项目地址: https://gitcode.com/GitHub_Trending/ed/edge-tts

引言:语音数据集创建的痛点与挑战

在人工智能语音技术快速发展的今天,高质量语音数据集的需求日益增长。然而,传统语音数据采集面临诸多挑战:

  • 成本高昂:专业录音设备和配音人员费用不菲
  • 时间消耗:人工录制和后期处理耗时漫长
  • 一致性差:不同录音环境和发音人导致质量参差不齐
  • 规模限制:难以快速生成大规模多样化数据

edge-tts作为微软Edge在线语音服务的Python封装,为解决这些痛点提供了革命性的自动化方案。本文将深入探讨如何利用edge-tts实现大规模语音数据集的批量生成。

edge-tts核心功能解析

技术架构概览

mermaid

关键特性优势

| 特性 | 优势 | 应用场景 |
| --- | --- | --- |
| 无需API密钥 | 零成本使用 | 学术研究、个人项目 |
| 多语言支持 | 全球语言覆盖 | 多语言语音合成 |
| 高质量音频 | 24kHz 48kbps MP3 | 专业语音数据集 |
| 实时字幕生成 | 自动时间戳标注 | 语音识别训练 |
| 参数可配置 | 音调、语速、音量调节 | 多样化数据生成 |

批量语音生成系统设计

系统架构设计

# 批量语音生成系统核心类设计
class BatchTTSGenerator:
    """Batch generator for large speech datasets built on edge-tts.

    The ``config`` mapping must contain ``output_dir``. It may also contain
    ``max_concurrent`` (default 10), which caps how many synthesis requests
    ``generate_batch`` runs at the same time.
    """

    def __init__(self, config: Dict):
        """Store the configuration and make sure the output directory exists.

        Args:
            config: Settings mapping; ``config['output_dir']`` is required.
        """
        self.config = config
        self.voice_manager = None
        self.output_dir = Path(config['output_dir'])
        # parents=True so a nested, not-yet-existing output path also works.
        self.output_dir.mkdir(parents=True, exist_ok=True)

    async def initialize_voices(self):
        """Create the ``VoicesManager`` instance and keep it on the object."""
        self.voice_manager = await VoicesManager.create()

    def load_text_corpus(self, corpus_path: str) -> List[Dict]:
        """Load a text corpus from a JSON, CSV or TXT file.

        The format is chosen from the file extension (case-insensitive).

        Args:
            corpus_path: Path of the corpus file.

        Returns:
            The corpus records. JSON content is returned as parsed, CSV rows
            become one dict per row, and every non-blank TXT line becomes
            ``{"id": "text_<line index>", "content": <stripped line>}``.
            ``_generate_single`` reads the ``id`` and ``content`` keys, so
            JSON and CSV corpora have to provide them.

        Raises:
            ValueError: If the extension is not ``.json``, ``.csv`` or ``.txt``.
        """
        # Local imports keep this class usable without touching module imports.
        import csv
        import json

        path = Path(corpus_path)
        suffix = path.suffix.lower()

        if suffix == '.json':
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)
        if suffix == '.csv':
            # newline='' is what the csv module expects for correct parsing of
            # quoted fields that contain line breaks.
            with open(path, 'r', encoding='utf-8', newline='') as f:
                return list(csv.DictReader(f))
        if suffix == '.txt':
            with open(path, 'r', encoding='utf-8') as f:
                return [{"id": f"text_{i}", "content": line.strip()}
                        for i, line in enumerate(f) if line.strip()]
        raise ValueError("Unsupported corpus format")

    async def generate_batch(self, texts: List[Dict],
                           voice_config: Dict) -> List[Path]:
        """Synthesize every item of ``texts`` with one voice configuration.

        Items are processed concurrently, limited by
        ``config['max_concurrent']``. A failing item does not abort the
        batch: it is reported on stdout and left out of the result.

        Args:
            texts: Records with ``id`` and ``content`` keys.
            voice_config: Voice settings passed to ``_generate_single``.

        Returns:
            Paths of the audio files that were generated successfully.
        """
        limit = asyncio.Semaphore(
            max(1, int(self.config.get('max_concurrent', 10)))
        )

        async def bounded(text_item: Dict) -> Path:
            async with limit:
                return await self._generate_single(text_item, voice_config)

        outcomes = await asyncio.gather(
            *(bounded(item) for item in texts), return_exceptions=True
        )

        generated = []
        for index, outcome in enumerate(outcomes):
            if isinstance(outcome, Path):
                generated.append(outcome)
            else:
                # Report instead of silently dropping the failure.
                print(f"Generation failed for item {index}: {outcome!r}")
        return generated

    async def _generate_single(self, text_item: Dict,
                             voice_config: Dict) -> Path:
        """Synthesize one record and return the path of its audio file.

        Args:
            text_item: Record with ``id`` (used as file stem) and ``content``.
            voice_config: Must contain ``voice``; ``rate``, ``volume`` and
                ``pitch`` are optional and default to no adjustment.

        Returns:
            Path of the written ``<id>.mp3`` file inside ``output_dir``.
        """
        communicate = Communicate(
            text=text_item['content'],
            voice=voice_config['voice'],
            rate=voice_config.get('rate', '+0%'),
            volume=voice_config.get('volume', '+0%'),
            pitch=voice_config.get('pitch', '+0Hz')
        )

        output_path = self.output_dir / f"{text_item['id']}.mp3"
        subtitle_path = self.output_dir / f"{text_item['id']}.srt"

        # NOTE(review): the second argument of Communicate.save is its
        # metadata file; confirm the written content is real SRT. edge-tts
        # documents SubMaker for building subtitles.
        await communicate.save(str(output_path), str(subtitle_path))
        return output_path

并发处理优化策略

mermaid

实战:万级语音数据集创建

环境配置与安装

# Create and activate a virtual environment
python -m venv tts-env
source tts-env/bin/activate

# Install edge-tts
pip install edge-tts

# Install extra dependencies.
# asyncio ships with Python itself; the PyPI package of that name is an
# obsolete backport and must not be installed.
pip install aiohttp pandas tqdm

完整批量生成脚本

#!/usr/bin/env python3
"""
大规模语音数据集批量生成脚本
支持万级别文本的自动化语音合成
"""

import asyncio
import csv
import json
from datetime import date
from pathlib import Path
from typing import List, Dict, Any

import aiohttp
import edge_tts
from edge_tts import Communicate, VoicesManager
from tqdm import tqdm

class MassTTSDatasetCreator:
    """Build a TTS dataset on disk with edge-tts.

    Everything is written below ``output_base`` into three sub-directories:
    ``audio`` (MP3 files), ``subtitles`` (one timing file per clip) and
    ``metadata`` (the dataset manifest).
    """

    def __init__(self, output_base: str = "./tts_dataset"):
        """Create the output directory tree.

        Args:
            output_base: Root directory of the dataset; missing parent
                directories are created as well.
        """
        self.output_base = Path(output_base)

        self.audio_dir = self.output_base / "audio"
        self.subtitle_dir = self.output_base / "subtitles"
        self.metadata_dir = self.output_base / "metadata"

        # parents=True also creates output_base itself, however deeply nested.
        for dir_path in (self.audio_dir, self.subtitle_dir, self.metadata_dir):
            dir_path.mkdir(parents=True, exist_ok=True)

    async def get_available_voices(self) -> List[Dict]:
        """Return the voice list reported by ``edge_tts.list_voices()``."""
        return await edge_tts.list_voices()

    def load_text_corpus(self, corpus_path: str) -> List[Dict]:
        """Load a text corpus from a JSON, CSV or TXT file.

        The format is chosen from the file extension (case-insensitive).

        Args:
            corpus_path: Path of the corpus file.

        Returns:
            The corpus records. JSON content is returned as parsed, CSV rows
            become one dict per row, and every non-blank TXT line becomes
            ``{"id": "text_<line index>", "content": <stripped line>}``.
            ``generate_dataset`` reads the ``id`` and ``content`` keys, so
            JSON and CSV corpora have to provide them.

        Raises:
            ValueError: If the extension is not ``.json``, ``.csv`` or ``.txt``.
        """
        corpus_path = Path(corpus_path)
        suffix = corpus_path.suffix.lower()

        if suffix == '.json':
            with open(corpus_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        if suffix == '.csv':
            # newline='' is what the csv module expects for correct parsing of
            # quoted fields that contain line breaks.
            with open(corpus_path, 'r', encoding='utf-8', newline='') as f:
                return list(csv.DictReader(f))
        if suffix == '.txt':
            with open(corpus_path, 'r', encoding='utf-8') as f:
                return [{"id": f"text_{i}", "content": line.strip()}
                        for i, line in enumerate(f) if line.strip()]
        raise ValueError("Unsupported corpus format")

    async def generate_dataset(self, corpus_path: str,
                               voice_configs: List[Dict],
                               batch_size: int = 100,
                               max_concurrent: int = 10) -> Dict[str, Any]:
        """Synthesize every corpus text with every voice configuration.

        One audio file and one subtitle file are written per
        (text, voice) pair. A failing pair is counted and reported on stdout
        but never aborts the run.

        Args:
            corpus_path: Path of the corpus (see ``load_text_corpus``).
            voice_configs: Voice settings; each needs ``voice`` and may set
                ``rate``, ``volume`` and ``pitch``.
            batch_size: Number of jobs scheduled together; the next batch
                starts once the current one has finished.
            max_concurrent: Upper bound on simultaneously running jobs.

        Returns:
            Counters ``total_tasks``, ``successful``, ``failed`` and
            ``generated_files``.
        """
        texts = self.load_text_corpus(corpus_path)
        total_texts = len(texts)

        print(f"加载 {total_texts} 条文本,使用 {len(voice_configs)} 种语音配置")

        # Values below 1 would block forever (0) or be rejected (negative).
        semaphore = asyncio.Semaphore(max(1, max_concurrent))

        results = {
            "total_tasks": total_texts * len(voice_configs),
            "successful": 0,
            "failed": 0,
            "generated_files": 0
        }

        pbar = tqdm(total=results["total_tasks"],
                    desc="生成语音数据", unit="task")

        async def process_item(text_item: Dict, voice_config: Dict):
            """Run one synthesis job and record its outcome in ``results``."""
            # Placeholders keep the failure report below from raising when
            # the record itself is malformed (e.g. a missing 'id' key).
            text_id = voice_name = "<unknown>"
            async with semaphore:
                try:
                    text_id = text_item['id']
                    voice_name = voice_config['voice']

                    base_name = f"{text_id}_{voice_name}"
                    audio_file = self.audio_dir / f"{base_name}.mp3"
                    subtitle_file = self.subtitle_dir / f"{base_name}.srt"

                    communicate = Communicate(
                        text=text_item['content'],
                        voice=voice_name,
                        rate=voice_config.get('rate', '+0%'),
                        volume=voice_config.get('volume', '+0%'),
                        pitch=voice_config.get('pitch', '+0Hz')
                    )

                    # NOTE(review): the second argument of Communicate.save
                    # is its metadata file; confirm the written content is
                    # real SRT. edge-tts documents SubMaker for subtitles.
                    try:
                        await communicate.save(
                            str(audio_file),
                            str(subtitle_file)
                        )
                    except BaseException:
                        # Remove partial output so that the manifest, which
                        # globs audio/*.mp3, never lists a broken sample.
                        for partial in (audio_file, subtitle_file):
                            if partial.exists():
                                partial.unlink()
                        raise

                    results["successful"] += 1
                    results["generated_files"] += 2

                except Exception as e:
                    results["failed"] += 1
                    print(f"处理失败: {text_id} - {voice_name}: {str(e)}")
                finally:
                    pbar.update(1)

        jobs = [(text_item, voice_config)
                for text_item in texts
                for voice_config in voice_configs]
        step = max(1, batch_size)

        try:
            # Scheduling in slices keeps the number of pending coroutines
            # bounded by batch_size instead of the full job count.
            for start in range(0, len(jobs), step):
                await asyncio.gather(
                    *(process_item(text_item, voice_config)
                      for text_item, voice_config in jobs[start:start + step])
                )
        finally:
            pbar.close()

        return results

    def create_dataset_manifest(self):
        """Scan the audio directory and write ``dataset_manifest.json``.

        File stems have the form ``<text_id>_<voice>``. The stem is split at
        its LAST underscore because text ids such as ``sample_1`` contain
        underscores themselves.

        Returns:
            The manifest dict that was written to the metadata directory.
        """
        manifest = {
            "dataset_info": {
                "name": "Edge-TTS Generated Dataset",
                "version": "1.0",
                "created_date": date.today().isoformat(),
                "total_samples": 0,
                "languages": [],
                "voices": []
            },
            "samples": []
        }

        # Sorted for a reproducible manifest regardless of directory order.
        for audio_file in sorted(self.audio_dir.glob("*.mp3")):
            text_id, separator, voice_name = audio_file.stem.rpartition('_')
            if not separator:
                # Not produced by generate_dataset; ignore.
                continue

            subtitle_file = self.subtitle_dir / f"{audio_file.stem}.srt"

            manifest["samples"].append({
                "text_id": text_id,
                "voice": voice_name,
                "audio_path": str(audio_file.relative_to(self.output_base)),
                "subtitle_path": str(subtitle_file.relative_to(self.output_base)) if subtitle_file.exists() else None
            })

        voices = sorted({sample["voice"] for sample in manifest["samples"]})
        manifest["dataset_info"]["total_samples"] = len(manifest["samples"])
        manifest["dataset_info"]["voices"] = voices
        # Assumes voice names look like "<locale>-<Name>" (for example
        # "en-US-AriaNeural"); names without a hyphen contribute no language.
        manifest["dataset_info"]["languages"] = sorted(
            {voice.rpartition('-')[0] for voice in voices if '-' in voice}
        )

        manifest_file = self.metadata_dir / "dataset_manifest.json"
        with open(manifest_file, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, ensure_ascii=False, indent=2)

        return manifest

async def main():
    """Demo driver: synthesize a tiny sample corpus and write its manifest.

    Writes ``sample_corpus.json`` into the current working directory, runs
    the generator over it with four voice configurations, prints the
    resulting counters and finally builds the dataset manifest.
    """
    dataset_creator = MassTTSDatasetCreator("./large_tts_dataset")

    # (voice, rate, volume) triples for the demo run.
    voice_configs = [
        {"voice": voice, "rate": rate, "volume": volume}
        for voice, rate, volume in (
            ("en-US-AriaNeural", "+0%", "+0%"),
            ("en-GB-SoniaNeural", "+10%", "-5%"),
            ("zh-CN-XiaoxiaoNeural", "+0%", "+0%"),
            ("ja-JP-NanamiNeural", "-5%", "+0%"),
        )
    ]

    # Small multilingual sample corpus, keyed by sample number.
    sentences = (
        "Hello, this is a test sentence for TTS generation.",
        "语音合成技术正在改变人机交互的方式。",
        "人工智能は現代社会に革命をもたらしています。",
    )
    sample_texts = [
        {"id": f"sample_{number}", "content": sentence}
        for number, sentence in enumerate(sentences, start=1)
    ]

    corpus_file = "sample_corpus.json"
    with open(corpus_file, 'w', encoding='utf-8') as handle:
        handle.write(json.dumps(sample_texts, ensure_ascii=False, indent=2))

    stats = await dataset_creator.generate_dataset(
        corpus_file,
        voice_configs,
        max_concurrent=5,
    )

    print("\n生成完成!")
    for label, key in (
        ("成功", "successful"),
        ("失败", "failed"),
        ("生成文件", "generated_files"),
    ):
        print(f"{label}: {stats[key]}")

    dataset_info = dataset_creator.create_dataset_manifest()["dataset_info"]
    print(f"数据集包含 {dataset_info['total_samples']} 个样本")
    print(f"支持语音: {', '.join(dataset_info['voices'])}")


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())

性能优化与错误处理

mermaid

质量保障与验证体系

自动化质量检测

【免费下载链接】edge-tts:Use Microsoft Edge's online text-to-speech service from Python WITHOUT needing Microsoft Edge or Windows or an API key。项目地址: https://gitcode.com/GitHub_Trending/ed/edge-tts

Logo

火山引擎开发者社区是火山引擎打造的AI技术生态平台,聚焦Agent与大模型开发,提供豆包系列模型(图像/视频/视觉)、智能分析与会话工具,并配套评测集、动手实验室及行业案例库。社区通过技术沙龙、挑战赛等活动促进开发者成长,新用户可领50万Tokens权益,助力构建智能应用。

更多推荐