diff --git a/.env.example b/.env.example index 9ab9793..8ccad9e 100644 --- a/.env.example +++ b/.env.example @@ -1 +1 @@ -FISH_AUDIO_API_KEY= \ No newline at end of file +FISH_API_KEY= \ No newline at end of file diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 619f999..f6b6ca4 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -72,7 +72,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.x" + python-version: "3.9" - name: Install uv uses: astral-sh/setup-uv@v4 @@ -83,7 +83,7 @@ jobs: - name: Run integration tests run: uv run pytest tests/integration/ -v env: - FISH_AUDIO_API_KEY: ${{ secrets.FISH_AUDIO_API_KEY }} + FISH_API_KEY: ${{ secrets.FISH_API_KEY }} - name: Upload Test Artifacts uses: actions/upload-artifact@v4 diff --git a/README.md b/README.md index 0923942..1bb11a7 100644 --- a/README.md +++ b/README.md @@ -1,205 +1,250 @@ # Fish Audio Python SDK -To provide convenient Python program integration for https://docs.fish.audio. +[![PyPI version](https://img.shields.io/pypi/v/fish-audio-sdk.svg)](https://badge.fury.io/py/fish-audio-sdk) +[![Python Version](https://img.shields.io/badge/python-3.9+-blue)](https://pypi.org/project/fish-audio-sdk/) +[![PyPI - Downloads](https://img.shields.io/pypi/dm/fish-audio-sdk)](https://pypi.org/project/fish-audio-sdk/) +[![codecov](https://img.shields.io/codecov/c/github/fishaudio/fish-audio-python)](https://codecov.io/gh/fishaudio/fish-audio-python) +[![License](https://img.shields.io/github/license/fishaudio/fish-audio-python)](https://github.com/fishaudio/fish-audio-python/blob/main/LICENSE) -## Install +The official Python library for the Fish Audio API -```bash -pip install fish-audio-sdk -``` -> [!NOTE] -> The new release has not officially been released yet - please see legacy SDK documentation for now. 
+**Documentation:** [Python SDK Guide](https://docs.fish.audio/developer-guide/sdk-guide/python/) | [API Reference](https://docs.fish.audio/api-reference/sdk/python/) -## Usage +> **Note:** If you're using the legacy `fish_audio_sdk` API, see the [migration guide](https://docs.fish.audio/archive/python-sdk-legacy/migration-guide) to upgrade. -### New SDK (Recommended) +## Installation -The new SDK uses the `fishaudio` module: - -```python -from fishaudio import FishAudio +```bash +pip install fish-audio-sdk -client = FishAudio(api_key="your_api_key") +# With audio playback utilities +pip install fish-audio-sdk[utils] ``` -You can customize the base URL: +## Authentication -```python -from fishaudio import FishAudio +Get your API key from [fish.audio/app/api-keys](https://fish.audio/app/api-keys): -client = FishAudio(api_key="your_api_key", base_url="https://your-proxy-domain") +```bash +export FISH_API_KEY=your_api_key_here ``` -### Legacy SDK - -The legacy SDK uses the `fish_audio_sdk` module. Initialize a `Session` to use APIs. All APIs have synchronous and asynchronous versions. If you want to use the asynchronous version of the API, you only need to rewrite the original `session.api_call(...)` to `session.api_call.awaitable(...)`. +Or provide directly: ```python -from fish_audio_sdk import Session +from fishaudio import FishAudio -session = Session("your_api_key") +client = FishAudio(api_key="your_api_key") ``` -Sometimes, you may need to change our endpoint to another address. 
You can use +## Quick Start -```python -from fish_audio_sdk import Session - -session = Session("your_api_key", base_url="https://your-proxy-domain") -``` - -### Text to speech +**Synchronous:** ```python -from fish_audio_sdk import Session, TTSRequest +from fishaudio import FishAudio +from fishaudio.utils import play, save + +client = FishAudio() -session = Session("your_api_key") +# Generate audio +audio = client.tts.convert(text="Hello, world!") -with open("r.mp3", "wb") as f: - for chunk in session.tts(TTSRequest(text="Hello, world!")): - f.write(chunk) +# Play or save +play(audio) +save(audio, "output.mp3") ``` -Or use async version: +**Asynchronous:** ```python import asyncio -import aiofiles - -from fish_audio_sdk import Session, TTSRequest - -session = Session("your_api_key") - +from fishaudio import AsyncFishAudio +from fishaudio.utils import play, save async def main(): - async with aiofiles.open("r.mp3", "wb") as f: - async for chunk in session.tts.awaitable( - TTSRequest(text="Hello, world!"), - ): - await f.write(chunk) - + client = AsyncFishAudio() + audio = await client.tts.convert(text="Hello, world!") + play(audio) + save(audio, "output.mp3") asyncio.run(main()) ``` -#### Reference Audio +## Core Features -```python -from fish_audio_sdk import TTSRequest +### Text-to-Speech + +**With custom voice:** -TTSRequest( - text="Hello, world!", - reference_id="your_model_id", +```python +# Use a specific voice by ID +audio = client.tts.convert( + text="Custom voice", + reference_id="802e3bc2b27e49c2995d23ef70e6ac89" ) ``` -Or just use `ReferenceAudio` in `TTSRequest`: +**With speed control:** ```python -from fish_audio_sdk import TTSRequest, ReferenceAudio - -TTSRequest( - text="Hello, world!", - references=[ - ReferenceAudio( - audio=audio_file.read(), - text="reference audio text", - ) - ], +audio = client.tts.convert( + text="Speaking faster!", + speed=1.5 # 1.5x speed ) ``` -### List models +**Reusable configuration:** ```python -models = 
session.list_models() -print(models) +from fishaudio.types import TTSConfig, Prosody + +config = TTSConfig( + prosody=Prosody(speed=1.2, volume=-5), + reference_id="933563129e564b19a115bedd57b7406a", + format="wav", + latency="balanced" +) + +# Reuse across generations +audio1 = client.tts.convert(text="First message", config=config) +audio2 = client.tts.convert(text="Second message", config=config) ``` -Or use async version: +**Chunk-by-chunk processing:** ```python -import asyncio +# Stream and process chunks as they arrive +for chunk in client.tts.stream(text="Long content..."): + send_to_websocket(chunk) +# Or collect all chunks +audio = client.tts.stream(text="Hello!").collect() +``` -async def main(): - models = await session.list_models.awaitable() - print(models) +[Learn more](https://docs.fish.audio/developer-guide/sdk-guide/python/text-to-speech) +### Speech-to-Text -asyncio.run(main()) +```python +# Transcribe audio +with open("audio.wav", "rb") as f: + result = client.asr.transcribe(audio=f.read(), language="en") + +print(result.text) + +# Access timestamped segments +for segment in result.segments: + print(f"[{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}") ``` +[Learn more](https://docs.fish.audio/developer-guide/sdk-guide/python/speech-to-text) + +### Real-time Streaming +Stream dynamically generated text for conversational AI and live applications: -### Get a model info by id +**Synchronous:** ```python -model = session.get_model("your_model_id") -print(model) +def text_chunks(): + yield "Hello, " + yield "this is " + yield "streaming!" + +audio_stream = client.tts.stream_websocket(text_chunks(), latency="balanced") +play(audio_stream) ``` -Or use async version: +**Asynchronous:** ```python -import asyncio +async def text_chunks(): + yield "Hello, " + yield "this is " + yield "streaming!" 
+audio_stream = await client.tts.stream_websocket(text_chunks(), latency="balanced") +play(audio_stream) +``` -async def main(): - model = await session.get_model.awaitable("your_model_id") - print(model) - +[Learn more](https://docs.fish.audio/developer-guide/sdk-guide/python/websocket) -asyncio.run(main()) -``` +### Voice Cloning -### Create a model +**Instant cloning:** ```python -model = session.create_model( - title="test", - description="test", - voices=[voice_file.read(), other_voice_file.read()], - cover_image=image_file.read(), -) -print(model) +from fishaudio.types import ReferenceAudio + +# Clone voice on-the-fly +with open("reference.wav", "rb") as f: + audio = client.tts.convert( + text="Cloned voice speaking", + references=[ReferenceAudio( + audio=f.read(), + text="Text spoken in reference" + )] + ) ``` -Or use async version: +**Persistent voice models:** ```python -import asyncio - - -async def main(): - model = await session.create_model.awaitable( - title="test", - description="test", - voices=[voice_file.read(), other_voice_file.read()], - cover_image=image_file.read(), +# Create voice model for reuse +with open("voice_sample.wav", "rb") as f: + voice = client.voices.create( + title="My Voice", + voices=[f.read()], + description="Custom voice clone" ) - print(model) - -asyncio.run(main()) +# Use the created model +audio = client.tts.convert( + text="Using my saved voice", + reference_id=voice.id +) ``` +[Learn more](https://docs.fish.audio/developer-guide/sdk-guide/python/voice-cloning) -### Delete a model +## Resource Clients -```python -session.delete_model("your_model_id") -``` +| Resource | Description | Key Methods | +|----------|-------------|-------------| +| `client.tts` | Text-to-speech | `convert()`, `stream()`, `stream_websocket()` | +| `client.asr` | Speech recognition | `transcribe()` | +| `client.voices` | Voice management | `list()`, `get()`, `create()`, `update()`, `delete()` | +| `client.account` | Account info | `get_credits()`, 
`get_package()` | -Or use async version: +## Error Handling ```python -import asyncio +from fishaudio.exceptions import ( + AuthenticationError, + RateLimitError, + ValidationError, + FishAudioError +) + +try: + audio = client.tts.convert(text="Hello!") +except AuthenticationError: + print("Invalid API key") +except RateLimitError: + print("Rate limit exceeded") +except ValidationError as e: + print(f"Invalid request: {e}") +except FishAudioError as e: + print(f"API error: {e}") +``` +## Resources -async def main(): - await session.delete_model.awaitable("your_model_id") +- **Documentation:** [SDK Guide](https://docs.fish.audio/developer-guide/sdk-guide/python/) | [API Reference](https://docs.fish.audio/api-reference/sdk/python/) +- **Package:** [PyPI](https://pypi.org/project/fish-audio-sdk/) | [GitHub](https://github.com/fishaudio/fish-audio-python) +- **Legacy SDK:** [Documentation](https://docs.fish.audio/archive/python-sdk-legacy) | [Migration Guide](https://docs.fish.audio/archive/python-sdk-legacy/migration-guide) +## License -asyncio.run(main()) -``` +This project is licensed under the Apache-2.0 License - see the [LICENSE](LICENSE) file for details. \ No newline at end of file diff --git a/examples/README.md b/examples/README.md index cc7510d..1132cd1 100644 --- a/examples/README.md +++ b/examples/README.md @@ -5,5 +5,5 @@ Example scripts demonstrating how to use the Fish Audio Python SDK. 
```bash # Install and setup pip install fishaudio -export FISH_AUDIO_API_KEY="your_api_key" +export FISH_API_KEY="your_api_key" ``` \ No newline at end of file diff --git a/examples/getting-started/01_simple_tts.py b/examples/getting-started/01_simple_tts.py index f312848..2574ce7 100644 --- a/examples/getting-started/01_simple_tts.py +++ b/examples/getting-started/01_simple_tts.py @@ -10,7 +10,7 @@ pip install fishaudio Environment Setup: - export FISH_AUDIO_API_KEY="your_api_key_here" + export FISH_API_KEY="your_api_key_here" # Or pass api_key directly to the client Expected Output: @@ -25,7 +25,7 @@ def main(): # Initialize the client with your API key - # Option 1: Use environment variable FISH_AUDIO_API_KEY + # Option 1: Use environment variable FISH_API_KEY # Option 2: Pass api_key directly: FishAudio(api_key="your_key") client = FishAudio() @@ -52,4 +52,4 @@ def main(): except Exception as e: print(f"Error: {e}") print("\nMake sure you have set your API key:") - print(" export FISH_AUDIO_API_KEY='your_api_key'") + print(" export FISH_API_KEY='your_api_key'") diff --git a/examples/getting-started/02_play_audio.py b/examples/getting-started/02_play_audio.py index 5b62750..34c8e29 100644 --- a/examples/getting-started/02_play_audio.py +++ b/examples/getting-started/02_play_audio.py @@ -19,7 +19,7 @@ # pip install sounddevice soundfile Environment Setup: - export FISH_AUDIO_API_KEY="your_api_key_here" + export FISH_API_KEY="your_api_key_here" Expected Output: - Plays the generated audio through your speakers @@ -98,7 +98,7 @@ def demo_playback_methods(): except Exception as e: print(f"Error: {e}") print("\nTroubleshooting:") - print("1. Make sure your API key is set: export FISH_AUDIO_API_KEY='your_key'") + print("1. Make sure your API key is set: export FISH_API_KEY='your_key'") print("2. 
Install ffmpeg for audio playback:") print(" - macOS: brew install ffmpeg") print(" - Ubuntu: sudo apt install ffmpeg") diff --git a/examples/getting-started/03_check_credits.py b/examples/getting-started/03_check_credits.py index 68fc721..87412ec 100644 --- a/examples/getting-started/03_check_credits.py +++ b/examples/getting-started/03_check_credits.py @@ -13,7 +13,7 @@ pip install fishaudio Environment Setup: - export FISH_AUDIO_API_KEY="your_api_key_here" + export FISH_API_KEY="your_api_key_here" Expected Output: - Displays account credit balance @@ -84,7 +84,7 @@ def check_api_setup(): print(f" Error: {e}") print("\nPlease check:") print(" 1. Your API key is correct") - print(" 2. Environment variable is set: export FISH_AUDIO_API_KEY='your_key'") + print(" 2. Environment variable is set: export FISH_API_KEY='your_key'") print(" 3. You have an active internet connection") return False @@ -100,6 +100,6 @@ def check_api_setup(): except Exception as e: print(f"\nError: {e}") print("\nMake sure you have set your API key:") - print(" export FISH_AUDIO_API_KEY='your_api_key'") + print(" export FISH_API_KEY='your_api_key'") print("\nOr pass it directly when creating the client:") print(" client = FishAudio(api_key='your_api_key')") diff --git a/pyproject.toml b/pyproject.toml index 8aa9dfb..49c8d9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ readme = "README.md" license = {text = "Apache-2.0"} keywords = ["fish-audio", "tts", "text-to-speech", "voice-cloning", "ai", "speech-synthesis"] classifiers = [ - "Development Status :: 4 - Beta", + "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", diff --git a/scripts/copy_docs.py b/scripts/copy_docs.py index a2587eb..ea82907 100644 --- a/scripts/copy_docs.py +++ b/scripts/copy_docs.py @@ -142,7 +142,7 @@ def copy_docs(sdk_root: Path, docs_root: Path) -> None: python_sdk_dir, 
lambda content: add_frontmatter( content, - title="Python SDK", + title="Overview", description="Fish Audio Python SDK for text-to-speech and voice cloning", icon="python", ), diff --git a/src/fishaudio/__init__.py b/src/fishaudio/__init__.py index bf33f15..dcedf83 100644 --- a/src/fishaudio/__init__.py +++ b/src/fishaudio/__init__.py @@ -28,6 +28,7 @@ from ._version import __version__ from .client import AsyncFishAudio, FishAudio +from .core.iterators import AsyncAudioStream, AudioStream from .exceptions import ( APIError, AuthenticationError, @@ -52,6 +53,9 @@ "play", "save", "stream", + # Audio streams + "AudioStream", + "AsyncAudioStream", # Types "FlushEvent", "TextEvent", diff --git a/src/fishaudio/client.py b/src/fishaudio/client.py index 5a914cf..53be1ec 100644 --- a/src/fishaudio/client.py +++ b/src/fishaudio/client.py @@ -51,7 +51,7 @@ def __init__( Initialize Fish Audio client. Args: - api_key: API key (can also use FISH_AUDIO_API_KEY env var) + api_key: API key (can also use FISH_API_KEY env var) base_url: API base URL timeout: Request timeout in seconds httpx_client: Optional custom HTTP client @@ -145,7 +145,7 @@ def __init__( Initialize async Fish Audio client. 
Args: - api_key: API key (can also use FISH_AUDIO_API_KEY env var) + api_key: API key (can also use FISH_API_KEY env var) base_url: API base URL timeout: Request timeout in seconds httpx_client: Optional custom async HTTP client diff --git a/src/fishaudio/core/client_wrapper.py b/src/fishaudio/core/client_wrapper.py index 2173f28..f1232f7 100644 --- a/src/fishaudio/core/client_wrapper.py +++ b/src/fishaudio/core/client_wrapper.py @@ -53,10 +53,10 @@ def __init__( api_key: Optional[str] = None, base_url: str = "https://api.fish.audio", ): - self.api_key = api_key or os.getenv("FISH_AUDIO_API_KEY") + self.api_key = api_key or os.getenv("FISH_API_KEY") if not self.api_key: raise ValueError( - "API key must be provided either as argument or via FISH_AUDIO_API_KEY environment variable" + "API key must be provided either as argument or via FISH_API_KEY environment variable" ) self.base_url = base_url diff --git a/src/fishaudio/core/iterators.py b/src/fishaudio/core/iterators.py new file mode 100644 index 0000000..fbd5df8 --- /dev/null +++ b/src/fishaudio/core/iterators.py @@ -0,0 +1,115 @@ +"""Audio stream wrappers with collection utilities.""" + +from typing import AsyncIterator, Iterator + + +class AudioStream: + """Wrapper for sync audio byte streams with collection utilities. + + This class wraps an iterator of audio bytes and provides a convenient + `.collect()` method to gather all chunks into a single bytes object. + + Examples: + ```python + from fishaudio import FishAudio + + client = FishAudio(api_key="...") + + # Collect all audio at once + audio = client.tts.stream(text="Hello!").collect() + + # Or stream chunks manually + for chunk in client.tts.stream(text="Hello!"): + process_chunk(chunk) + ``` + """ + + def __init__(self, iterator: Iterator[bytes]): + """Initialize the audio iterator wrapper. 
+ + Args: + iterator: The underlying iterator of audio bytes + """ + self._iter = iterator + + def __iter__(self) -> Iterator[bytes]: + """Allow direct iteration over audio chunks.""" + return self._iter + + def collect(self) -> bytes: + """Collect all audio chunks into a single bytes object. + + This consumes the iterator and returns all audio data as bytes. + After calling this method, the iterator cannot be used again. + + Returns: + Complete audio data as bytes + + Examples: + ```python + audio = client.tts.stream(text="Hello!").collect() + with open("output.mp3", "wb") as f: + f.write(audio) + ``` + """ + chunks = [] + for chunk in self._iter: + chunks.append(chunk) + return b"".join(chunks) + + +class AsyncAudioStream: + """Wrapper for async audio byte streams with collection utilities. + + This class wraps an async iterator of audio bytes and provides a convenient + `.collect()` method to gather all chunks into a single bytes object. + + Examples: + ```python + from fishaudio import AsyncFishAudio + + client = AsyncFishAudio(api_key="...") + + # Collect all audio at once + stream = await client.tts.stream(text="Hello!") + audio = await stream.collect() + + # Or stream chunks manually + async for chunk in await client.tts.stream(text="Hello!"): + await process_chunk(chunk) + ``` + """ + + def __init__(self, async_iterator: AsyncIterator[bytes]): + """Initialize the async audio iterator wrapper. + + Args: + async_iterator: The underlying async iterator of audio bytes + """ + self._iter = async_iterator + + def __aiter__(self) -> AsyncIterator[bytes]: + """Allow direct async iteration over audio chunks.""" + return self._iter + + async def collect(self) -> bytes: + """Collect all audio chunks into a single bytes object. + + This consumes the async iterator and returns all audio data as bytes. + After calling this method, the iterator cannot be used again. 
+ + Returns: + Complete audio data as bytes + + Examples: + ```python + stream = await client.tts.stream(text="Hello!") + audio = await stream.collect() + with open("output.mp3", "wb") as f: + f.write(audio) + ``` + """ + chunks = [] + async for chunk in self._iter: + chunks.append(chunk) + return b"".join(chunks) diff --git a/src/fishaudio/resources/tts.py b/src/fishaudio/resources/tts.py index fef1cd4..bd3ceec 100644 --- a/src/fishaudio/resources/tts.py +++ b/src/fishaudio/resources/tts.py @@ -9,6 +9,7 @@ from .realtime import aiter_websocket_audio, iter_websocket_audio from ..core import AsyncClientWrapper, ClientWrapper, RequestOptions +from ..core.iterators import AsyncAudioStream, AudioStream from ..types import ( AudioFormat, CloseEvent, @@ -58,7 +59,7 @@ class TTSClient: def __init__(self, client_wrapper: ClientWrapper): self._client = client_wrapper - def convert( + def stream( self, *, text: str, @@ -70,9 +71,9 @@ def convert( config: TTSConfig = TTSConfig(), model: Model = "s1", request_options: Optional[RequestOptions] = None, - ) -> Iterator[bytes]: + ) -> AudioStream: """ - Convert text to speech. + Stream text-to-speech audio chunks. 
Args: text: Text to synthesize @@ -86,48 +87,20 @@ def convert( request_options: Request-level overrides Returns: - Iterator of audio bytes + AudioStream object that can be iterated for audio chunks Example: ```python - from fishaudio import FishAudio, TTSConfig, ReferenceAudio + from fishaudio import FishAudio client = FishAudio(api_key="...") - # Simple usage with defaults - audio = client.tts.convert(text="Hello world") - - # With format parameter - audio = client.tts.convert(text="Hello world", format="wav") - - # With speed parameter - audio = client.tts.convert(text="Hello world", speed=1.5) - - # With reference_id parameter - audio = client.tts.convert(text="Hello world", reference_id="your_model_id") - - # With references parameter - audio = client.tts.convert( - text="Hello world", - references=[ReferenceAudio(audio=audio_bytes, text="sample")] - ) - - # Combine multiple parameters - audio = client.tts.convert( - text="Hello world", - format="wav", - speed=1.2, - latency="normal" - ) - - # Parameters override config values - config = TTSConfig(format="mp3", prosody=Prosody(speed=1.0)) - audio = client.tts.convert(text="Hello world", format="wav", config=config) - # Result: format="wav" (parameter wins) + # Stream and process chunks + for chunk in client.tts.stream(text="Hello world"): + process_audio_chunk(chunk) - with open("output.mp3", "wb") as f: - for chunk in audio: - f.write(chunk) + # Or collect all at once + audio = client.tts.stream(text="Hello world").collect() ``` """ # Build request payload from config @@ -160,10 +133,75 @@ def convert( request_options=request_options, ) - # Stream response chunks - for chunk in response.iter_bytes(): - if chunk: - yield chunk + # Create generator and wrap with AudioStream + def _stream(): + for chunk in response.iter_bytes(): + if chunk: + yield chunk + + return AudioStream(_stream()) + + def convert( + self, + *, + text: str, + reference_id: Optional[str] = None, + references: Optional[List[ReferenceAudio]] 
= None, + format: Optional[AudioFormat] = None, + latency: Optional[LatencyMode] = None, + speed: Optional[float] = None, + config: TTSConfig = TTSConfig(), + model: Model = "s1", + request_options: Optional[RequestOptions] = None, + ) -> bytes: + """ + Convert text to speech and return complete audio as bytes. + + This is a convenience method that streams all audio chunks and combines them. + For chunk-by-chunk processing, use stream() instead. + + Args: + text: Text to synthesize + reference_id: Voice reference ID (overrides config.reference_id if provided) + references: Reference audio samples (overrides config.references if provided) + format: Audio format - "mp3", "wav", "pcm", or "opus" (overrides config.format if provided) + latency: Latency mode - "normal" or "balanced" (overrides config.latency if provided) + speed: Speech speed multiplier, e.g. 1.5 for 1.5x speed (overrides config.prosody.speed if provided) + config: TTS configuration (audio settings, voice, model parameters) + model: TTS model to use + request_options: Request-level overrides + + Returns: + Complete audio as bytes + + Example: + ```python + from fishaudio import FishAudio + from fishaudio.utils import play, save + + client = FishAudio(api_key="...") + + # Get complete audio + audio = client.tts.convert(text="Hello world") + + # Play it + play(audio) + + # Or save it + save(audio, "output.mp3") + ``` + """ + return self.stream( + text=text, + reference_id=reference_id, + references=references, + format=format, + latency=latency, + speed=speed, + config=config, + model=model, + request_options=request_options, + ).collect() def stream_websocket( self, @@ -307,7 +345,7 @@ class AsyncTTSClient: def __init__(self, client_wrapper: AsyncClientWrapper): self._client = client_wrapper - async def convert( + async def stream( self, *, text: str, @@ -319,9 +357,9 @@ async def convert( config: TTSConfig = TTSConfig(), model: Model = "s1", request_options: Optional[RequestOptions] = None, - ): + ) -> 
AsyncAudioStream: """ - Convert text to speech (async). + Stream text-to-speech audio chunks (async). Args: text: Text to synthesize @@ -335,48 +373,21 @@ async def convert( request_options: Request-level overrides Returns: - Async iterator of audio bytes + AsyncAudioStream object that can be iterated for audio chunks Example: ```python - from fishaudio import AsyncFishAudio, TTSConfig, ReferenceAudio + from fishaudio import AsyncFishAudio client = AsyncFishAudio(api_key="...") - # Simple usage with defaults - audio = await client.tts.convert(text="Hello world") - - # With format parameter - audio = await client.tts.convert(text="Hello world", format="wav") - - # With speed parameter - audio = await client.tts.convert(text="Hello world", speed=1.5) - - # With reference_id parameter - audio = await client.tts.convert(text="Hello world", reference_id="your_model_id") - - # With references parameter - audio = await client.tts.convert( - text="Hello world", - references=[ReferenceAudio(audio=audio_bytes, text="sample")] - ) - - # Combine multiple parameters - audio = await client.tts.convert( - text="Hello world", - format="wav", - speed=1.2, - latency="normal" - ) - - # Parameters override config values - config = TTSConfig(format="mp3", prosody=Prosody(speed=1.0)) - audio = await client.tts.convert(text="Hello world", format="wav", config=config) - # Result: format="wav" (parameter wins) + # Stream and process chunks + async for chunk in await client.tts.stream(text="Hello world"): + await process_audio_chunk(chunk) - async with aiofiles.open("output.mp3", "wb") as f: - async for chunk in audio: - await f.write(chunk) + # Or collect all at once + stream = await client.tts.stream(text="Hello world") + audio = await stream.collect() ``` """ # Build request payload from config @@ -409,10 +420,76 @@ async def convert( request_options=request_options, ) - # Stream response chunks - async for chunk in response.aiter_bytes(): - if chunk: - yield chunk + # Create async 
generator and wrap with AsyncAudioStream + async def _stream(): + async for chunk in response.aiter_bytes(): + if chunk: + yield chunk + + return AsyncAudioStream(_stream()) + + async def convert( + self, + *, + text: str, + reference_id: Optional[str] = None, + references: Optional[List[ReferenceAudio]] = None, + format: Optional[AudioFormat] = None, + latency: Optional[LatencyMode] = None, + speed: Optional[float] = None, + config: TTSConfig = TTSConfig(), + model: Model = "s1", + request_options: Optional[RequestOptions] = None, + ) -> bytes: + """ + Convert text to speech and return complete audio as bytes (async). + + This is a convenience method that streams all audio chunks and combines them. + For chunk-by-chunk processing, use stream() instead. + + Args: + text: Text to synthesize + reference_id: Voice reference ID (overrides config.reference_id if provided) + references: Reference audio samples (overrides config.references if provided) + format: Audio format - "mp3", "wav", "pcm", or "opus" (overrides config.format if provided) + latency: Latency mode - "normal" or "balanced" (overrides config.latency if provided) + speed: Speech speed multiplier, e.g. 
1.5 for 1.5x speed (overrides config.prosody.speed if provided) + config: TTS configuration (audio settings, voice, model parameters) + model: TTS model to use + request_options: Request-level overrides + + Returns: + Complete audio as bytes + + Example: + ```python + from fishaudio import AsyncFishAudio + from fishaudio.utils import play, save + + client = AsyncFishAudio(api_key="...") + + # Get complete audio + audio = await client.tts.convert(text="Hello world") + + # Play it + play(audio) + + # Or save it + save(audio, "output.mp3") + ``` + """ + stream = await self.stream( + text=text, + reference_id=reference_id, + references=references, + format=format, + latency=latency, + speed=speed, + config=config, + model=model, + request_options=request_options, + ) + return await stream.collect() async def stream_websocket( self, diff --git a/src/fishaudio/types/account.py b/src/fishaudio/types/account.py index 2803383..966ade5 100644 --- a/src/fishaudio/types/account.py +++ b/src/fishaudio/types/account.py @@ -7,7 +7,17 @@ class Credits(BaseModel): - """User's API credit balance.""" + """User's API credit balance. + + Attributes: + id: Unique credits record identifier + user_id: User identifier + credit: Current credit balance (decimal for precise accounting) + created_at: Timestamp when the credits record was created + updated_at: Timestamp when the credits were last updated + has_phone_sha256: Whether the user has a verified phone number. Optional + has_free_credit: Whether the user has received free credits. Optional + """ model_config = ConfigDict(populate_by_name=True) @@ -21,7 +31,18 @@ class Credits(BaseModel): class Package(BaseModel): - """User's prepaid package information.""" + """User's prepaid package information. 
+ + Attributes: + id: Unique package identifier + user_id: User identifier + type: Package type identifier + total: Total units in the package + balance: Remaining units in the package + created_at: Timestamp when the package was purchased + updated_at: Timestamp when the package was last updated + finished_at: Timestamp when the package was fully consumed. None if still active + """ model_config = ConfigDict(populate_by_name=True) diff --git a/src/fishaudio/types/asr.py b/src/fishaudio/types/asr.py index 84d2dbb..db73916 100644 --- a/src/fishaudio/types/asr.py +++ b/src/fishaudio/types/asr.py @@ -6,7 +6,13 @@ class ASRSegment(BaseModel): - """A timestamped segment of transcribed text.""" + """A timestamped segment of transcribed text. + + Attributes: + text: The transcribed text for this segment + start: Segment start time in seconds + end: Segment end time in seconds + """ text: str start: float @@ -14,7 +20,13 @@ class ASRSegment(BaseModel): class ASRResponse(BaseModel): - """Response from speech-to-text transcription.""" + """Response from speech-to-text transcription. + + Attributes: + text: Complete transcription of the entire audio + duration: Total audio duration in milliseconds + segments: List of timestamped text segments. Empty if include_timestamps=False + """ text: str duration: float # Duration in milliseconds diff --git a/src/fishaudio/types/shared.py b/src/fishaudio/types/shared.py index df7ab4a..1e756d9 100644 --- a/src/fishaudio/types/shared.py +++ b/src/fishaudio/types/shared.py @@ -9,7 +9,12 @@ class PaginatedResponse(BaseModel, Generic[T]): - """Generic paginated response.""" + """Generic paginated response. 
+ + Attributes: + total: Total number of items across all pages + items: List of items on the current page + """ total: int items: List[T] @@ -25,7 +30,7 @@ class PaginatedResponse(BaseModel, Generic[T]): Visibility = Literal["public", "unlist", "private"] # Training mode types -TrainMode = Literal["fast", "full"] +TrainMode = Literal["fast"] # Model state types ModelState = Literal["created", "training", "trained", "failed"] diff --git a/src/fishaudio/types/tts.py b/src/fishaudio/types/tts.py index 4dd7671..8b0923a 100644 --- a/src/fishaudio/types/tts.py +++ b/src/fishaudio/types/tts.py @@ -8,14 +8,27 @@ class ReferenceAudio(BaseModel): - """Reference audio for voice cloning/style.""" + """Reference audio for voice cloning/style. + + Attributes: + audio: Audio file bytes for the reference sample + text: Transcription of what is spoken in the reference audio. Should match exactly + what's spoken and include punctuation for proper prosody. + """ audio: bytes text: str class Prosody(BaseModel): - """Speech prosody settings (speed and volume).""" + """Speech prosody settings (speed and volume). + + Attributes: + speed: Speech speed multiplier. Range: 0.5-2.0. Default: 1.0. + Examples: 1.5 = 50% faster, 0.8 = 20% slower + volume: Volume adjustment in decibels. Range: -20.0 to 20.0. Default: 0.0 (no change). + Positive values increase volume, negative values decrease it. + """ speed: Annotated[float, Field(ge=0.5, le=2.0)] = 1.0 volume: Annotated[float, Field(ge=-20.0, le=20.0)] = 0.0 @@ -45,6 +58,23 @@ class TTSConfig(BaseModel): Reusable configuration for text-to-speech requests. Create once, use multiple times. All parameters have sensible defaults. + + Attributes: + format: Audio output format. Options: "mp3", "wav", "pcm", "opus". Default: "mp3" + sample_rate: Audio sample rate in Hz. If None, uses format-specific default. + mp3_bitrate: MP3 bitrate in kbps. Options: 64, 128, 192. Default: 128 + opus_bitrate: Opus bitrate in kbps. Options: -1000, 24, 32, 48, 64. 
Default: 32 + normalize: Whether to normalize/clean the input text. Default: True + chunk_length: Characters per generation chunk. Range: 100-300. Default: 200. + Lower values = faster initial response, higher values = better quality + latency: Generation mode. Options: "normal" (higher quality), "balanced" (faster). Default: "balanced" + reference_id: Voice model ID from fish.audio (e.g., "802e3bc2b27e49c2995d23ef70e6ac89"). + Find IDs in voice URLs or via voices.list() + references: List of reference audio samples for instant voice cloning. Default: [] + prosody: Speech speed and volume settings. Default: None (uses natural prosody) + top_p: Nucleus sampling parameter for token selection. Range: 0.0-1.0. Default: 0.7 + temperature: Randomness in generation. Range: 0.0-1.0. Default: 0.7. + Higher = more varied, lower = more consistent """ # Audio output settings @@ -74,6 +104,21 @@ class TTSRequest(BaseModel): This model is used internally for WebSocket streaming. For the HTTP API, parameters are passed directly to methods. + + Attributes: + text: Text to synthesize into speech + chunk_length: Characters per generation chunk. Range: 100-300. Default: 200 + format: Audio output format. Options: "mp3", "wav", "pcm", "opus". Default: "mp3" + sample_rate: Audio sample rate in Hz. If None, uses format-specific default + mp3_bitrate: MP3 bitrate in kbps. Options: 64, 128, 192. Default: 128 + opus_bitrate: Opus bitrate in kbps. Options: -1000, 24, 32, 48, 64. Default: 32 + references: List of reference audio samples for voice cloning. Default: [] + reference_id: Voice model ID for using a specific voice. Default: None + normalize: Whether to normalize/clean the input text. Default: True + latency: Generation mode. Options: "normal", "balanced". Default: "balanced" + prosody: Speech speed and volume settings. Default: None + top_p: Nucleus sampling for token selection. Range: 0.0-1.0. Default: 0.7 + temperature: Randomness in generation. Range: 0.0-1.0. 
Default: 0.7 """ text: str @@ -93,26 +138,46 @@ class TTSRequest(BaseModel): # WebSocket event types for streaming TTS class StartEvent(BaseModel): - """WebSocket start event.""" + """WebSocket start event to initiate TTS streaming. + + Attributes: + event: Event type identifier, always "start" + request: TTS configuration for the streaming session + """ event: Literal["start"] = "start" request: TTSRequest class TextEvent(BaseModel): - """WebSocket text chunk event.""" + """WebSocket event to send a text chunk for synthesis. + + Attributes: + event: Event type identifier, always "text" + text: Text chunk to synthesize + """ event: Literal["text"] = "text" text: str class FlushEvent(BaseModel): - """WebSocket flush event - forces buffer to generate audio immediately.""" + """WebSocket event to force immediate audio generation from buffered text. + + Use this to ensure all buffered text is synthesized without waiting for more input. + + Attributes: + event: Event type identifier, always "flush" + """ event: Literal["flush"] = "flush" class CloseEvent(BaseModel): - """WebSocket close event.""" + """WebSocket event to end the streaming session. + + Attributes: + event: Event type identifier, always "stop" + """ event: Literal["stop"] = "stop" diff --git a/src/fishaudio/types/voices.py b/src/fishaudio/types/voices.py index 90e41b2..04f7570 100644 --- a/src/fishaudio/types/voices.py +++ b/src/fishaudio/types/voices.py @@ -9,7 +9,14 @@ class Sample(BaseModel): - """A sample audio for a voice model.""" + """A sample audio for a voice model. + + Attributes: + title: Title/name of the audio sample + text: Transcription of the spoken content in the sample + task_id: Unique identifier for the sample task + audio: URL or path to the audio file + """ title: str text: str @@ -18,7 +25,13 @@ class Sample(BaseModel): class Author(BaseModel): - """Voice model author information.""" + """Voice model author information. 
+ + Attributes: + id: Unique author identifier + nickname: Author's display name + avatar: URL to author's avatar image + """ id: str = Field(alias="_id") nickname: str @@ -27,9 +40,32 @@ class Voice(BaseModel): """ - A voice model + A voice model. Represents a TTS voice that can be used for synthesis. + + Attributes: + id: Unique voice model identifier (use as reference_id in TTS) + type: Model type. Options: "svc" (singing voice conversion), "tts" (text-to-speech) + title: Voice model title/name + description: Detailed description of the voice model + cover_image: URL to the voice model's cover image + train_mode: Training mode used. Options: "fast" + state: Current model state. Options: "created", "training", "trained", "failed" + tags: List of tags for categorization (e.g., ["male", "english", "young"]) + samples: List of audio samples demonstrating the voice + created_at: Timestamp when the model was created + updated_at: Timestamp when the model was last updated + languages: List of supported language codes (e.g., ["en", "zh"]) + visibility: Model visibility. Options: "public", "unlist", "private" + lock_visibility: Whether visibility setting is locked + like_count: Number of likes the model has received + mark_count: Number of bookmarks/favorites + shared_count: Number of times the model has been shared + task_count: Number of times the model has been used for generation + liked: Whether the current user has liked this model. Default: False + marked: Whether the current user has bookmarked this model. 
Default: False + author: Information about the voice model's creator """ id: str = Field(alias="_id") diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 00ec2d8..2d43b32 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,5 +1,7 @@ """Fixtures for integration tests.""" +from __future__ import annotations + import os from pathlib import Path @@ -21,9 +23,9 @@ @pytest.fixture def api_key(): """Get API key from environment.""" - key = os.getenv("FISH_AUDIO_API_KEY") + key = os.getenv("FISH_API_KEY") if not key: - pytest.skip("No API key available (set FISH_AUDIO_API_KEY)") + pytest.skip("No API key available (set FISH_API_KEY)") return key @@ -55,17 +57,20 @@ def save_audio(): A callable that takes audio chunks and filename and saves to output/ """ - def _save(audio_chunks: list[bytes], filename: str) -> Path: - """Save audio chunks to output directory. + def _save(audio: bytes | list[bytes], filename: str) -> Path: + """Save audio to output directory. 
Args: - audio_chunks: List of audio byte chunks + audio: Audio bytes or list of audio byte chunks filename: Name of the output file (including extension) Returns: Path to the saved file """ - complete_audio = b"".join(audio_chunks) + if isinstance(audio, bytes): + complete_audio = audio + else: + complete_audio = b"".join(audio) output_file = OUTPUT_DIR / filename output_file.write_bytes(complete_audio) return output_file diff --git a/tests/integration/test_asr_integration.py b/tests/integration/test_asr_integration.py index 953f7c8..7e2602e 100644 --- a/tests/integration/test_asr_integration.py +++ b/tests/integration/test_asr_integration.py @@ -13,10 +13,7 @@ def sample_audio(self, client): """Generate sample audio for ASR testing.""" # Generate audio from known text config = TTSConfig(format="wav") - audio_chunks = list( - client.tts.convert(text="Hello world, this is a test.", config=config) - ) - return b"".join(audio_chunks) + return client.tts.convert(text="Hello world, this is a test.", config=config) def test_basic_asr(self, client, sample_audio): """Test basic speech-to-text transcription.""" @@ -54,13 +51,8 @@ class TestAsyncASRIntegration: @pytest.fixture async def async_sample_audio(self, async_client): """Generate sample audio for async ASR testing.""" - audio_chunks = [] config = TTSConfig(format="wav") - async for chunk in async_client.tts.convert( - text="Async test audio", config=config - ): - audio_chunks.append(chunk) - return b"".join(audio_chunks) + return await async_client.tts.convert(text="Async test audio", config=config) @pytest.mark.asyncio async def test_async_basic_asr(self, async_client, async_sample_audio): diff --git a/tests/integration/test_tts_integration.py b/tests/integration/test_tts_integration.py index 8d00d77..f6b4fc2 100644 --- a/tests/integration/test_tts_integration.py +++ b/tests/integration/test_tts_integration.py @@ -13,15 +13,13 @@ class TestTTSIntegration: def test_basic_tts(self, client, save_audio): """Test basic 
text-to-speech generation.""" - audio_chunks = list(client.tts.convert(text="Hello, this is a test.")) + audio = client.tts.convert(text="Hello, this is a test.") - assert len(audio_chunks) > 0 - # Verify we got audio data (check for common audio headers) - complete_audio = b"".join(audio_chunks) - assert len(complete_audio) > 1000 # Should have substantial audio data + assert len(audio) > 1000 # Should have substantial audio data + assert isinstance(audio, bytes) # Write to output directory - save_audio(audio_chunks, "test_basic_tts.mp3") + save_audio(audio, "test_basic_tts.mp3") def test_tts_with_different_formats(self, client, save_audio): """Test TTS with different audio formats.""" @@ -29,27 +27,23 @@ def test_tts_with_different_formats(self, client, save_audio): for fmt in formats: config = TTSConfig(format=fmt, chunk_length=100) - audio_chunks = list( - client.tts.convert(text=f"Testing format {fmt}", config=config) - ) - assert len(audio_chunks) > 0, f"Failed for format: {fmt}" + audio = client.tts.convert(text=f"Testing format {fmt}", config=config) + assert len(audio) > 0, f"Failed for format: {fmt}" # Write to output directory - save_audio(audio_chunks, f"test_format_{fmt}.{fmt}") + save_audio(audio, f"test_format_{fmt}.{fmt}") def test_tts_with_prosody(self, client, save_audio): """Test TTS with prosody settings.""" prosody = Prosody(speed=1.2, volume=0.5) config = TTSConfig(prosody=prosody) - audio_chunks = list( - client.tts.convert(text="Testing prosody settings", config=config) - ) + audio = client.tts.convert(text="Testing prosody settings", config=config) - assert len(audio_chunks) > 0 + assert len(audio) > 0 # Write to output directory - save_audio(audio_chunks, "test_prosody.mp3") + save_audio(audio, "test_prosody.mp3") def test_tts_with_different_models(self, client, save_audio): """Test TTS with different models.""" @@ -57,13 +51,11 @@ def test_tts_with_different_models(self, client, save_audio): for model in models: try: - audio_chunks = 
list( - client.tts.convert(text=f"Testing model {model}", model=model) - ) - assert len(audio_chunks) > 0, f"Failed for model: {model}" + audio = client.tts.convert(text=f"Testing model {model}", model=model) + assert len(audio) > 0, f"Failed for model: {model}" # Write to output directory - save_audio(audio_chunks, f"test_model_{model}.mp3") + save_audio(audio, f"test_model_{model}.mp3") except Exception as e: # Some models might not be available pytest.skip(f"Model {model} not available: {e}") @@ -73,23 +65,21 @@ def test_tts_longer_text(self, client, save_audio): long_text = "This is a longer piece of text for testing. " * 10 config = TTSConfig(chunk_length=200) - audio_chunks = list(client.tts.convert(text=long_text, config=config)) + audio = client.tts.convert(text=long_text, config=config) - assert len(audio_chunks) > 0 - complete_audio = b"".join(audio_chunks) # Longer text should produce more audio - assert len(complete_audio) > 5000 + assert len(audio) > 5000 # Write to output directory - save_audio(audio_chunks, "test_longer_text.mp3") + save_audio(audio, "test_longer_text.mp3") def test_tts_empty_text_should_fail(self, client): """Test that empty text is handled.""" # This might succeed with silence or fail - test behavior try: - audio_chunks = list(client.tts.convert(text="")) + audio = client.tts.convert(text="") # If it succeeds, verify we get something - assert len(audio_chunks) >= 0 + assert isinstance(audio, bytes) except Exception: # If it fails, that's also acceptable pass @@ -101,16 +91,13 @@ class TestAsyncTTSIntegration: @pytest.mark.asyncio async def test_basic_async_tts(self, async_client, save_audio): """Test basic async text-to-speech generation.""" - audio_chunks = [] - async for chunk in async_client.tts.convert(text="Hello from async"): - audio_chunks.append(chunk) + audio = await async_client.tts.convert(text="Hello from async") - assert len(audio_chunks) > 0 - complete_audio = b"".join(audio_chunks) - assert len(complete_audio) > 1000 
+ assert len(audio) > 1000 + assert isinstance(audio, bytes) # Write to output directory - save_audio(audio_chunks, "test_async_basic.mp3") + save_audio(audio, "test_async_basic.mp3") @pytest.mark.asyncio async def test_async_tts_with_prosody(self, async_client, save_audio): @@ -118,13 +105,9 @@ async def test_async_tts_with_prosody(self, async_client, save_audio): prosody = Prosody(speed=0.8, volume=-0.2) config = TTSConfig(prosody=prosody) - audio_chunks = [] - async for chunk in async_client.tts.convert( - text="Async prosody test", config=config - ): - audio_chunks.append(chunk) + audio = await async_client.tts.convert(text="Async prosody test", config=config) - assert len(audio_chunks) > 0 + assert len(audio) > 0 # Write to output directory - save_audio(audio_chunks, "test_async_prosody.mp3") + save_audio(audio, "test_async_prosody.mp3") diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index f1aa2c0..d288491 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -20,7 +20,7 @@ def test_init_with_api_key(self, mock_api_key): assert client._client_wrapper.api_key == mock_api_key def test_init_with_env_var(self, mock_api_key): - with patch.dict("os.environ", {"FISH_AUDIO_API_KEY": mock_api_key}): + with patch.dict("os.environ", {"FISH_API_KEY": mock_api_key}): client = FishAudio() assert client._client_wrapper.api_key == mock_api_key diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index 76a3611..f77dc04 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -67,7 +67,7 @@ def test_init_without_api_key_raises(self): ClientWrapper() def test_init_with_env_var(self, mock_api_key): - with patch.dict("os.environ", {"FISH_AUDIO_API_KEY": mock_api_key}): + with patch.dict("os.environ", {"FISH_API_KEY": mock_api_key}): wrapper = ClientWrapper() assert wrapper.api_key == mock_api_key diff --git a/tests/unit/test_tts.py b/tests/unit/test_tts.py index 6ddff60..47bfb06 100644 --- a/tests/unit/test_tts.py +++ 
b/tests/unit/test_tts.py @@ -40,15 +40,15 @@ def async_tts_client(async_mock_client_wrapper): class TestTTSClient: """Test synchronous TTSClient.""" - def test_convert_basic(self, tts_client, mock_client_wrapper): - """Test basic TTS conversion.""" + def test_stream_basic(self, tts_client, mock_client_wrapper): + """Test basic TTS streaming.""" # Setup mock response with audio chunks mock_response = Mock() mock_response.iter_bytes.return_value = iter([b"chunk1", b"chunk2", b"chunk3"]) mock_client_wrapper.request.return_value = mock_response - # Call convert - audio_chunks = list(tts_client.convert(text="Hello world")) + # Call stream + audio_chunks = list(tts_client.stream(text="Hello world")) # Verify we got chunks back assert audio_chunks == [b"chunk1", b"chunk2", b"chunk3"] @@ -67,6 +67,23 @@ def test_convert_basic(self, tts_client, mock_client_wrapper): # Check payload was msgpack encoded assert "content" in call_args[1] + def test_convert_basic(self, tts_client, mock_client_wrapper): + """Test basic TTS conversion returns bytes.""" + # Setup mock response with audio chunks + mock_response = Mock() + mock_response.iter_bytes.return_value = iter([b"chunk1", b"chunk2", b"chunk3"]) + mock_client_wrapper.request.return_value = mock_response + + # Call convert + audio = tts_client.convert(text="Hello world") + + # Verify we got complete audio as bytes + assert audio == b"chunk1chunk2chunk3" + assert isinstance(audio, bytes) + + # Verify request was made correctly + mock_client_wrapper.request.assert_called_once() + def test_convert_with_reference_id(self, tts_client, mock_client_wrapper): """Test TTS with reference voice ID.""" mock_response = Mock() @@ -74,7 +91,7 @@ def test_convert_with_reference_id(self, tts_client, mock_client_wrapper): mock_client_wrapper.request.return_value = mock_response config = TTSConfig(reference_id="voice_123") - list(tts_client.convert(text="Hello", config=config)) + tts_client.convert(text="Hello", config=config) # Verify 
reference_id in payload call_args = mock_client_wrapper.request.call_args @@ -87,7 +104,7 @@ def test_convert_with_reference_id_parameter(self, tts_client, mock_client_wrapp mock_response.iter_bytes.return_value = iter([b"audio"]) mock_client_wrapper.request.return_value = mock_response - list(tts_client.convert(text="Hello", reference_id="voice_456")) + tts_client.convert(text="Hello", reference_id="voice_456") # Verify reference_id in payload call_args = mock_client_wrapper.request.call_args @@ -103,11 +120,7 @@ def test_convert_parameter_reference_id_overrides_config( mock_client_wrapper.request.return_value = mock_response config = TTSConfig(reference_id="voice_from_config") - list( - tts_client.convert( - text="Hello", reference_id="voice_from_param", config=config - ) - ) + tts_client.convert(text="Hello", reference_id="voice_from_param", config=config) # Verify parameter reference_id takes precedence call_args = mock_client_wrapper.request.call_args @@ -126,7 +139,7 @@ def test_convert_with_references(self, tts_client, mock_client_wrapper): ] config = TTSConfig(references=references) - list(tts_client.convert(text="Hello", config=config)) + tts_client.convert(text="Hello", config=config) # Verify references in payload call_args = mock_client_wrapper.request.call_args @@ -146,7 +159,7 @@ def test_convert_with_references_parameter(self, tts_client, mock_client_wrapper ReferenceAudio(audio=b"ref_audio_2", text="Sample 2"), ] - list(tts_client.convert(text="Hello", references=references)) + tts_client.convert(text="Hello", references=references) # Verify references in payload call_args = mock_client_wrapper.request.call_args @@ -167,7 +180,7 @@ def test_convert_parameter_references_overrides_config( param_refs = [ReferenceAudio(audio=b"param_audio", text="Param")] config = TTSConfig(references=config_refs) - list(tts_client.convert(text="Hello", references=param_refs, config=config)) + tts_client.convert(text="Hello", references=param_refs, config=config) # 
Verify parameter references take precedence call_args = mock_client_wrapper.request.call_args @@ -181,7 +194,7 @@ def test_convert_with_different_backend(self, tts_client, mock_client_wrapper): mock_response.iter_bytes.return_value = iter([b"audio"]) mock_client_wrapper.request.return_value = mock_response - list(tts_client.convert(text="Hello", model="s1")) + tts_client.convert(text="Hello", model="s1") # Verify model in headers call_args = mock_client_wrapper.request.call_args @@ -196,7 +209,7 @@ def test_convert_with_prosody(self, tts_client, mock_client_wrapper): prosody = Prosody(speed=1.5, volume=0.5) config = TTSConfig(prosody=prosody) - list(tts_client.convert(text="Hello", config=config)) + tts_client.convert(text="Hello", config=config) # Verify prosody in payload call_args = mock_client_wrapper.request.call_args @@ -221,7 +234,7 @@ def test_convert_with_custom_parameters(self, tts_client, mock_client_wrapper): temperature=0.8, ) - list(tts_client.convert(text="Hello", config=config)) + tts_client.convert(text="Hello", config=config) # Verify parameters in payload call_args = mock_client_wrapper.request.call_args @@ -242,7 +255,7 @@ def test_convert_omit_parameters_not_sent(self, tts_client, mock_client_wrapper) mock_client_wrapper.request.return_value = mock_response # Call with defaults (None values should be excluded) - list(tts_client.convert(text="Hello")) + tts_client.convert(text="Hello") # Verify None params not in payload call_args = mock_client_wrapper.request.call_args @@ -266,14 +279,14 @@ def test_convert_with_request_options(self, tts_client, mock_client_wrapper): timeout=120.0, additional_headers={"X-Custom": "value"} ) - list(tts_client.convert(text="Hello", request_options=request_options)) + tts_client.convert(text="Hello", request_options=request_options) # Verify request_options passed through call_args = mock_client_wrapper.request.call_args assert call_args[1]["request_options"] == request_options - def 
test_convert_streaming_behavior(self, tts_client, mock_client_wrapper): - """Test that convert returns an iterator that can be consumed.""" + def test_stream_behavior(self, tts_client, mock_client_wrapper): + """Test that stream returns an iterator that can be consumed.""" # Setup mock with multiple chunks mock_response = Mock() chunks = [b"chunk1", b"chunk2", b"chunk3", b""] # Empty chunk should be skipped @@ -281,11 +294,11 @@ def test_convert_streaming_behavior(self, tts_client, mock_client_wrapper): mock_client_wrapper.request.return_value = mock_response # Get iterator - audio_iterator = tts_client.convert(text="Hello") + audio_stream = tts_client.stream(text="Hello") # Consume one chunk at a time result = [] - for chunk in audio_iterator: + for chunk in audio_stream: result.append(chunk) # Empty chunk should be filtered out @@ -297,9 +310,9 @@ def test_convert_empty_response(self, tts_client, mock_client_wrapper): mock_response.iter_bytes.return_value = iter([]) mock_client_wrapper.request.return_value = mock_response - audio_chunks = list(tts_client.convert(text="Hello")) + audio = tts_client.convert(text="Hello") - assert audio_chunks == [] + assert audio == b"" def test_convert_with_format_parameter(self, tts_client, mock_client_wrapper): """Test TTS with format as direct parameter.""" @@ -307,7 +320,7 @@ def test_convert_with_format_parameter(self, tts_client, mock_client_wrapper): mock_response.iter_bytes.return_value = iter([b"audio"]) mock_client_wrapper.request.return_value = mock_response - list(tts_client.convert(text="Hello", format="wav")) + tts_client.convert(text="Hello", format="wav") # Verify format in payload call_args = mock_client_wrapper.request.call_args @@ -320,7 +333,7 @@ def test_convert_with_opus_format(self, tts_client, mock_client_wrapper): mock_response.iter_bytes.return_value = iter([b"audio"]) mock_client_wrapper.request.return_value = mock_response - list(tts_client.convert(text="Hello", format="opus")) + 
tts_client.convert(text="Hello", format="opus") # Verify opus format in payload call_args = mock_client_wrapper.request.call_args @@ -333,7 +346,7 @@ def test_convert_with_latency_parameter(self, tts_client, mock_client_wrapper): mock_response.iter_bytes.return_value = iter([b"audio"]) mock_client_wrapper.request.return_value = mock_response - list(tts_client.convert(text="Hello", latency="normal")) + tts_client.convert(text="Hello", latency="normal") # Verify latency in payload call_args = mock_client_wrapper.request.call_args @@ -346,7 +359,7 @@ def test_convert_with_speed_parameter(self, tts_client, mock_client_wrapper): mock_response.iter_bytes.return_value = iter([b"audio"]) mock_client_wrapper.request.return_value = mock_response - list(tts_client.convert(text="Hello", speed=1.5)) + tts_client.convert(text="Hello", speed=1.5) # Verify speed creates prosody in payload call_args = mock_client_wrapper.request.call_args @@ -362,7 +375,7 @@ def test_convert_parameter_format_overrides_config( mock_client_wrapper.request.return_value = mock_response config = TTSConfig(format="wav") - list(tts_client.convert(text="Hello", format="pcm", config=config)) + tts_client.convert(text="Hello", format="pcm", config=config) # Verify parameter format takes precedence call_args = mock_client_wrapper.request.call_args @@ -378,7 +391,7 @@ def test_convert_parameter_speed_overrides_config_prosody( mock_client_wrapper.request.return_value = mock_response config = TTSConfig(prosody=Prosody(speed=2.0, volume=0.5)) - list(tts_client.convert(text="Hello", speed=1.5, config=config)) + tts_client.convert(text="Hello", speed=1.5, config=config) # Verify parameter speed takes precedence but volume is preserved call_args = mock_client_wrapper.request.call_args @@ -410,8 +423,8 @@ class TestAsyncTTSClient: """Test asynchronous AsyncTTSClient.""" @pytest.mark.asyncio - async def test_convert_basic(self, async_tts_client, async_mock_client_wrapper): - """Test basic async TTS conversion.""" + 
async def test_stream_basic(self, async_tts_client, async_mock_client_wrapper): + """Test basic async TTS streaming.""" # Setup mock response mock_response = Mock() @@ -422,9 +435,10 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - # Call convert and collect chunks + # Call stream and collect chunks audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello world"): + stream = await async_tts_client.stream(text="Hello world") + async for chunk in stream: audio_chunks.append(chunk) assert audio_chunks == [b"chunk1", b"chunk2", b"chunk3"] @@ -436,6 +450,29 @@ async def async_iter_bytes(): assert call_args[0][0] == "POST" assert call_args[0][1] == "/v1/tts" + @pytest.mark.asyncio + async def test_convert_basic(self, async_tts_client, async_mock_client_wrapper): + """Test basic async TTS conversion returns bytes.""" + # Setup mock response + mock_response = Mock() + + async def async_iter_bytes(): + for chunk in [b"chunk1", b"chunk2", b"chunk3"]: + yield chunk + + mock_response.aiter_bytes = async_iter_bytes + async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) + + # Call convert + audio = await async_tts_client.convert(text="Hello world") + + # Verify we got complete audio as bytes + assert audio == b"chunk1chunk2chunk3" + assert isinstance(audio, bytes) + + # Verify request was made + async_mock_client_wrapper.request.assert_called_once() + @pytest.mark.asyncio async def test_convert_with_reference_id( self, async_tts_client, async_mock_client_wrapper @@ -450,9 +487,7 @@ async def async_iter_bytes(): async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) config = TTSConfig(reference_id="voice_123") - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello", config=config): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", config=config) # Verify reference_id in 
payload call_args = async_mock_client_wrapper.request.call_args @@ -472,11 +507,7 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert( - text="Hello", reference_id="voice_456" - ): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", reference_id="voice_456") # Verify reference_id in payload call_args = async_mock_client_wrapper.request.call_args @@ -497,11 +528,9 @@ async def async_iter_bytes(): async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) config = TTSConfig(reference_id="voice_from_config") - audio_chunks = [] - async for chunk in async_tts_client.convert( + await async_tts_client.convert( text="Hello", reference_id="voice_from_param", config=config - ): - audio_chunks.append(chunk) + ) # Verify parameter reference_id takes precedence call_args = async_mock_client_wrapper.request.call_args @@ -526,11 +555,7 @@ async def async_iter_bytes(): ReferenceAudio(audio=b"ref_audio_2", text="Sample 2"), ] - audio_chunks = [] - async for chunk in async_tts_client.convert( - text="Hello", references=references - ): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", references=references) # Verify references in payload call_args = async_mock_client_wrapper.request.call_args @@ -556,11 +581,9 @@ async def async_iter_bytes(): param_refs = [ReferenceAudio(audio=b"param_audio", text="Param")] config = TTSConfig(references=config_refs) - audio_chunks = [] - async for chunk in async_tts_client.convert( + await async_tts_client.convert( text="Hello", references=param_refs, config=config - ): - audio_chunks.append(chunk) + ) # Verify parameter references take precedence call_args = async_mock_client_wrapper.request.call_args @@ -584,9 +607,7 @@ async def async_iter_bytes(): prosody = Prosody(speed=2.0, volume=1.0) config = 
TTSConfig(prosody=prosody) - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello", config=config): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", config=config) # Verify prosody in payload call_args = async_mock_client_wrapper.request.call_args @@ -607,9 +628,7 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello"): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello") # Verify OMIT params not in payload call_args = async_mock_client_wrapper.request.call_args @@ -633,11 +652,9 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello"): - audio_chunks.append(chunk) + audio = await async_tts_client.convert(text="Hello") - assert audio_chunks == [] + assert audio == b"" @pytest.mark.asyncio async def test_convert_with_format_parameter( @@ -652,9 +669,7 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello", format="wav"): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", format="wav") # Verify format in payload call_args = async_mock_client_wrapper.request.call_args @@ -674,9 +689,7 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello", latency="normal"): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", latency="normal") # Verify latency in payload 
call_args = async_mock_client_wrapper.request.call_args @@ -696,9 +709,7 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello", speed=1.5): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", speed=1.5) # Verify speed creates prosody in payload call_args = async_mock_client_wrapper.request.call_args @@ -719,11 +730,7 @@ async def async_iter_bytes(): async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) config = TTSConfig(format="wav") - audio_chunks = [] - async for chunk in async_tts_client.convert( - text="Hello", format="pcm", config=config - ): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", format="pcm", config=config) # Verify parameter format takes precedence call_args = async_mock_client_wrapper.request.call_args @@ -744,11 +751,7 @@ async def async_iter_bytes(): async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) config = TTSConfig(prosody=Prosody(speed=2.0, volume=0.5)) - audio_chunks = [] - async for chunk in async_tts_client.convert( - text="Hello", speed=1.5, config=config - ): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", speed=1.5, config=config) # Verify parameter speed takes precedence but volume is preserved call_args = async_mock_client_wrapper.request.call_args @@ -769,11 +772,9 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert( + await async_tts_client.convert( text="Hello", format="wav", speed=1.3, latency="normal" - ): - audio_chunks.append(chunk) + ) # Verify all parameters in payload call_args = async_mock_client_wrapper.request.call_args