From 642c0f8ded4531b08d9888048de3e0d1c406f381 Mon Sep 17 00:00:00 2001 From: James Ding Date: Wed, 12 Nov 2025 14:51:34 -0600 Subject: [PATCH 01/16] docs: update README to highlight new API and migration guide --- README.md | 250 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 131 insertions(+), 119 deletions(-) diff --git a/README.md b/README.md index 0923942..eddc4ec 100644 --- a/README.md +++ b/README.md @@ -1,205 +1,217 @@ # Fish Audio Python SDK -To provide convenient Python program integration for https://docs.fish.audio. +[![PyPI version](https://badge.fury.io/py/fish-audio-sdk.svg)](https://badge.fury.io/py/fish-audio-sdk) +[![PyPI - Downloads](https://img.shields.io/pypi/dm/fish-audio-sdk)](https://pypi.org/project/fish-audio-sdk/) +[![Build Status](https://img.shields.io/github/actions/workflow/status/fishaudio/fish-audio-python/ci.yml?branch=main)](https://github.com/fishaudio/fish-audio-python/actions) +[![codecov](https://codecov.io/gh/fishaudio/fish-audio-python/branch/main/graph/badge.svg)](https://codecov.io/gh/fishaudio/fish-audio-python) +[![Python Version](https://img.shields.io/pypi/pyversions/fish-audio-sdk)](https://pypi.org/project/fish-audio-sdk/) +[![License](https://img.shields.io/github/license/fishaudio/fish-audio-python)](https://github.com/fishaudio/fish-audio-python/blob/main/LICENSE) -## Install +The official Python library for the Fish Audio API - AI-powered text-to-speech, voice cloning, and speech recognition. -```bash -pip install fish-audio-sdk -``` -> [!NOTE] -> The new release has not officially been released yet - please see legacy SDK documentation for now. 
+[Documentation](https://docs.fish.audio) | [API Reference](https://docs.fish.audio) | [Examples](./examples/) | [Discord](https://fish.audio) -## Usage +--- -### New SDK (Recommended) +## Important: New API Available -The new SDK uses the `fishaudio` module: +> **We've released a major update to the Fish Audio Python SDK!** +> +> The new API (`fishaudio` module) offers improved ergonomics, better type safety, and enhanced features. The legacy SDK (`fish_audio_sdk` module) continues to be supported for existing projects, but we recommend using the new API for all new development. +> +> **Migration:** Both APIs are available in the same package. You can migrate at your own pace. See our [Migration Guide](https://docs.fish.audio) for details. -```python -from fishaudio import FishAudio +--- -client = FishAudio(api_key="your_api_key") +## Quick Start + +### Installation + +```bash +pip install fish-audio-sdk ``` -You can customize the base URL: +### Basic Usage ```python from fishaudio import FishAudio +from fishaudio.utils import save -client = FishAudio(api_key="your_api_key", base_url="https://your-proxy-domain") +# Set your API key via environment variable: export FISH_AUDIO_API_KEY="your-api-key" +# Or pass it directly: FishAudio(api_key="your-api-key") +client = FishAudio() + +# Convert text to speech +audio = client.tts.convert(text="Hello from Fish Audio!") +save(audio, "output.mp3") ``` -### Legacy SDK +[Get your API key](https://fish.audio) | [Full Getting Started Guide](https://docs.fish.audio) -The legacy SDK uses the `fish_audio_sdk` module. Initialize a `Session` to use APIs. All APIs have synchronous and asynchronous versions. If you want to use the asynchronous version of the API, you only need to rewrite the original `session.api_call(...)` to `session.api_call.awaitable(...)`. 
+--- -```python -from fish_audio_sdk import Session +## Key Features -session = Session("your_api_key") -``` +- **Text-to-Speech** - Natural-sounding voice synthesis with multiple voice options +- **Voice Cloning** - Create custom voices using reference audio samples +- **Real-time Streaming** - Low-latency audio generation via WebSocket connections +- **Speech-to-Text (ASR)** - Accurate automatic speech recognition with language detection +- **Voice Management** - Create, update, and organize custom voice models +- **Sync and Async APIs** - Full support for both synchronous and asynchronous operations +- **Type Safety** - Complete type hints with Pydantic models throughout -Sometimes, you may need to change our endpoint to another address. You can use +--- + +## Examples + +### Text-to-Speech ```python -from fish_audio_sdk import Session +from fishaudio import FishAudio +from fishaudio.utils import save -session = Session("your_api_key", base_url="https://your-proxy-domain") +client = FishAudio() +audio = client.tts.convert(text="Hello, world!") +save(audio, "output.mp3") ``` -### Text to speech +### Voice Cloning with Reference Audio ```python -from fish_audio_sdk import Session, TTSRequest +from fishaudio import FishAudio -session = Session("your_api_key") +client = FishAudio() -with open("r.mp3", "wb") as f: - for chunk in session.tts(TTSRequest(text="Hello, world!")): - f.write(chunk) +# Use a reference voice for cloning +with open("reference.wav", "rb") as f: + audio = client.tts.convert( + text="This will sound like the reference voice!", + reference_audio=f.read(), + reference_text="Transcription of the reference audio" + ) ``` -Or use async version: +### Real-time Streaming ```python -import asyncio -import aiofiles - -from fish_audio_sdk import Session, TTSRequest - -session = Session("your_api_key") +from fishaudio import FishAudio +from fishaudio.utils import play +client = FishAudio() -async def main(): - async with aiofiles.open("r.mp3", "wb") as f: - 
async for chunk in session.tts.awaitable( - TTSRequest(text="Hello, world!"), - ): - await f.write(chunk) - +# Stream audio in real-time +audio_stream = client.tts.stream( + text="This audio streams as it's generated", + latency="balanced" +) -asyncio.run(main()) +play(audio_stream) ``` -#### Reference Audio +### Speech Recognition (ASR) ```python -from fish_audio_sdk import TTSRequest +from fishaudio import FishAudio -TTSRequest( - text="Hello, world!", - reference_id="your_model_id", -) +client = FishAudio() + +# Transcribe audio to text +with open("audio.wav", "rb") as f: + result = client.asr.transcribe(audio=f.read()) + print(result.text) ``` -Or just use `ReferenceAudio` in `TTSRequest`: +### List and Filter Voices ```python -from fish_audio_sdk import TTSRequest, ReferenceAudio - -TTSRequest( - text="Hello, world!", - references=[ - ReferenceAudio( - audio=audio_file.read(), - text="reference audio text", - ) - ], -) -``` +from fishaudio import FishAudio -### List models +client = FishAudio() -```python -models = session.list_models() -print(models) +# List available voices +voices = client.voices.list(language="en") + +for voice in voices: + print(f"{voice.title} - {voice.id}") ``` -Or use async version: +### Async Usage ```python import asyncio - +from fishaudio import AsyncFishAudio async def main(): - models = await session.list_models.awaitable() - print(models) + client = AsyncFishAudio() + audio = await client.tts.convert(text="Async text-to-speech!") + # Process audio... 
asyncio.run(main()) ``` - - -### Get a model info by id +### Check Account Credits ```python -model = session.get_model("your_model_id") -print(model) +from fishaudio import FishAudio + +client = FishAudio() +credits = client.account.get_credits() +print(f"Remaining credits: {credits.credit}") ``` -Or use async version: +[More examples in /examples directory](./examples/) -```python -import asyncio +--- +## Documentation -async def main(): - model = await session.get_model.awaitable("your_model_id") - print(model) +- [API Reference](https://docs.fish.audio) - Complete API documentation with all parameters and options +- [Tutorials & Guides](https://docs.fish.audio) - Step-by-step tutorials for common use cases +- [Examples](./examples/) - Sample code demonstrating various features +- [Migration Guide](https://docs.fish.audio) - Guide for upgrading from the legacy SDK +--- -asyncio.run(main()) -``` +## Requirements -### Create a model +- Python 3.9 or higher +- Fish Audio API key - [Get one here](https://fish.audio) -```python -model = session.create_model( - title="test", - description="test", - voices=[voice_file.read(), other_voice_file.read()], - cover_image=image_file.read(), -) -print(model) -``` +### Optional Dependencies -Or use async version: +For audio playback utilities: -```python -import asyncio +```bash +pip install fish-audio-sdk[utils] +``` +This installs `sounddevice` and `soundfile` for the `play()` utility function. 
-async def main(): - model = await session.create_model.awaitable( - title="test", - description="test", - voices=[voice_file.read(), other_voice_file.read()], - cover_image=image_file.read(), - ) - print(model) +--- +## Community & Support -asyncio.run(main()) -``` +- [Discord Community](https://fish.audio) - Join our community for discussions and support +- [GitHub Issues](https://github.com/fishaudio/fish-audio-python/issues) - Report bugs or request features +- [Documentation](https://docs.fish.audio) - Comprehensive guides and API reference +--- -### Delete a model +## License -```python -session.delete_model("your_model_id") -``` +This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. -Or use async version: - -```python -import asyncio +--- +## Legacy SDK -async def main(): - await session.delete_model.awaitable("your_model_id") +The legacy `fish_audio_sdk` module is still available for existing projects: +```python +from fish_audio_sdk import Session -asyncio.run(main()) +session = Session("your_api_key") ``` + +We recommend migrating to the new `fishaudio` module for new projects. See our [Migration Guide](https://docs.fish.audio) for assistance. 
From 810386ed5994c18cd56f7286e3fa61073a27faca Mon Sep 17 00:00:00 2001 From: James Ding Date: Wed, 12 Nov 2025 15:22:03 -0600 Subject: [PATCH 02/16] docs: update README with new badge for Python version and improved build status link --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index eddc4ec..202c53b 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ # Fish Audio Python SDK +[![Official SDK](https://img.shields.io/badge/Fish_Audio-Official_SDK-21176d?logo=fishaudio&logoColor=fff&logoSize=auto)](https://fish.audio) [![PyPI version](https://badge.fury.io/py/fish-audio-sdk.svg)](https://badge.fury.io/py/fish-audio-sdk) [![PyPI - Downloads](https://img.shields.io/pypi/dm/fish-audio-sdk)](https://pypi.org/project/fish-audio-sdk/) -[![Build Status](https://img.shields.io/github/actions/workflow/status/fishaudio/fish-audio-python/ci.yml?branch=main)](https://github.com/fishaudio/fish-audio-python/actions) -[![codecov](https://codecov.io/gh/fishaudio/fish-audio-python/branch/main/graph/badge.svg)](https://codecov.io/gh/fishaudio/fish-audio-python) -[![Python Version](https://img.shields.io/pypi/pyversions/fish-audio-sdk)](https://pypi.org/project/fish-audio-sdk/) +[![Build Status](https://img.shields.io/github/actions/workflow/status/fishaudio/fish-audio-python/python.yml?branch=main)](https://github.com/fishaudio/fish-audio-python/actions) +[![codecov](https://img.shields.io/codecov/c/github/fishaudio/fish-audio-python)](https://codecov.io/gh/fishaudio/fish-audio-python) +[![Python Version](https://img.shields.io/badge/python-3.9+-blue)](https://pypi.org/project/fish-audio-sdk/) [![License](https://img.shields.io/github/license/fishaudio/fish-audio-python)](https://github.com/fishaudio/fish-audio-python/blob/main/LICENSE) The official Python library for the Fish Audio API - AI-powered text-to-speech, voice cloning, and speech recognition. 
From bf2f5dbe991143bc01c0ad925d647c5ecbf2b0a2 Mon Sep 17 00:00:00 2001 From: James Ding Date: Wed, 12 Nov 2025 15:37:39 -0600 Subject: [PATCH 03/16] docs: update README badge to reflect Python SDK branding --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 202c53b..57314e1 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,8 @@ # Fish Audio Python SDK -[![Official SDK](https://img.shields.io/badge/Fish_Audio-Official_SDK-21176d?logo=fishaudio&logoColor=fff&logoSize=auto)](https://fish.audio) +[![Official SDK](https://img.shields.io/badge/Fish_Audio-Python_SDK-21176d?logo=fishaudio&logoColor=fff&logoSize=auto)](https://fish.audio) [![PyPI version](https://badge.fury.io/py/fish-audio-sdk.svg)](https://badge.fury.io/py/fish-audio-sdk) [![PyPI - Downloads](https://img.shields.io/pypi/dm/fish-audio-sdk)](https://pypi.org/project/fish-audio-sdk/) -[![Build Status](https://img.shields.io/github/actions/workflow/status/fishaudio/fish-audio-python/python.yml?branch=main)](https://github.com/fishaudio/fish-audio-python/actions) [![codecov](https://img.shields.io/codecov/c/github/fishaudio/fish-audio-python)](https://codecov.io/gh/fishaudio/fish-audio-python) [![Python Version](https://img.shields.io/badge/python-3.9+-blue)](https://pypi.org/project/fish-audio-sdk/) [![License](https://img.shields.io/github/license/fishaudio/fish-audio-python)](https://github.com/fishaudio/fish-audio-python/blob/main/LICENSE) From cc72716374831afd54ed49259d7f9d5103166d5c Mon Sep 17 00:00:00 2001 From: James Ding Date: Wed, 12 Nov 2025 15:43:25 -0600 Subject: [PATCH 04/16] docs: update README to remove redundant badge and simplify description --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 57314e1..d07a39c 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,12 @@ # Fish Audio Python SDK -[![Official 
SDK](https://img.shields.io/badge/Fish_Audio-Python_SDK-21176d?logo=fishaudio&logoColor=fff&logoSize=auto)](https://fish.audio) [![PyPI version](https://badge.fury.io/py/fish-audio-sdk.svg)](https://badge.fury.io/py/fish-audio-sdk) +[![Python Version](https://img.shields.io/badge/python-3.9+-blue)](https://pypi.org/project/fish-audio-sdk/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/fish-audio-sdk)](https://pypi.org/project/fish-audio-sdk/) [![codecov](https://img.shields.io/codecov/c/github/fishaudio/fish-audio-python)](https://codecov.io/gh/fishaudio/fish-audio-python) -[![Python Version](https://img.shields.io/badge/python-3.9+-blue)](https://pypi.org/project/fish-audio-sdk/) [![License](https://img.shields.io/github/license/fishaudio/fish-audio-python)](https://github.com/fishaudio/fish-audio-python/blob/main/LICENSE) -The official Python library for the Fish Audio API - AI-powered text-to-speech, voice cloning, and speech recognition. +The official Python library for the Fish Audio API. 
[Documentation](https://docs.fish.audio) | [API Reference](https://docs.fish.audio) | [Examples](./examples/) | [Discord](https://fish.audio) From 62b955ce5f4f7f0078bc908cbccf9f0ab034f2be Mon Sep 17 00:00:00 2001 From: James Ding Date: Wed, 12 Nov 2025 21:58:04 -0600 Subject: [PATCH 05/16] docs: update README to reflect new API features and improve installation instructions --- README.md | 171 +++++++++++++++++++++++++----------------------------- 1 file changed, 79 insertions(+), 92 deletions(-) diff --git a/README.md b/README.md index d07a39c..3ada700 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Fish Audio Python SDK -[![PyPI version](https://badge.fury.io/py/fish-audio-sdk.svg)](https://badge.fury.io/py/fish-audio-sdk) +[![PyPI version](https://img.shields.io/pypi/v/fish-audio-sdk.svg)](https://badge.fury.io/py/fish-audio-sdk) [![Python Version](https://img.shields.io/badge/python-3.9+-blue)](https://pypi.org/project/fish-audio-sdk/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/fish-audio-sdk)](https://pypi.org/project/fish-audio-sdk/) [![codecov](https://img.shields.io/codecov/c/github/fishaudio/fish-audio-python)](https://codecov.io/gh/fishaudio/fish-audio-python) @@ -8,80 +8,81 @@ The official Python library for the Fish Audio API. -[Documentation](https://docs.fish.audio) | [API Reference](https://docs.fish.audio) | [Examples](./examples/) | [Discord](https://fish.audio) +## Notice: New API Available ---- +The SDK now includes a modern `fishaudio` API with improved ergonomics, better type safety, and enhanced features. -## Important: New API Available +For new projects, use the `fishaudio` module. For existing projects using the legacy API, see the [Legacy SDK section](#legacy-sdk) below -> **We've released a major update to the Fish Audio Python SDK!** -> -> The new API (`fishaudio` module) offers improved ergonomics, better type safety, and enhanced features. 
The legacy SDK (`fish_audio_sdk` module) continues to be supported for existing projects, but we recommend using the new API for all new development. -> -> **Migration:** Both APIs are available in the same package. You can migrate at your own pace. See our [Migration Guide](https://docs.fish.audio) for details. +## API Documentation ---- +See the Python API Documentation and Reference -## Quick Start +## Installation -### Installation +This package is available on PyPI: ```bash pip install fish-audio-sdk ``` -### Basic Usage +You may install from source by running the following command in the repository root: -```python -from fishaudio import FishAudio -from fishaudio.utils import save - -# Set your API key via environment variable: export FISH_AUDIO_API_KEY="your-api-key" -# Or pass it directly: FishAudio(api_key="your-api-key") -client = FishAudio() - -# Convert text to speech -audio = client.tts.convert(text="Hello from Fish Audio!") -save(audio, "output.mp3") +```bash +python -m pip install . ``` -[Get your API key](https://fish.audio) | [Full Getting Started Guide](https://docs.fish.audio) +## Usage ---- +The client will need to be configured with an API key, which you can obtain from [Fish Audio](https://fish.audio/app/api-keys). 
-## Key Features +```python +from fishaudio import FishAudio -- **Text-to-Speech** - Natural-sounding voice synthesis with multiple voice options -- **Voice Cloning** - Create custom voices using reference audio samples -- **Real-time Streaming** - Low-latency audio generation via WebSocket connections -- **Speech-to-Text (ASR)** - Accurate automatic speech recognition with language detection -- **Voice Management** - Create, update, and organize custom voice models -- **Sync and Async APIs** - Full support for both synchronous and asynchronous operations -- **Type Safety** - Complete type hints with Pydantic models throughout +client = FishAudio() # Automatically reads from the FISH_AUDIO_API_KEY environment variable ---- +client = FishAudio(api_key="your-api-key") # Or provide the API key directly +``` -## Examples +The SDK provides [text-to-speech](#text-to-speech), [voice cloning](#instant-voice-cloning), [speech recognition](#speech-recognition-asr), and [voice management](#voice-management) capabilities. ### Text-to-Speech +Convert text to natural-sounding speech with support for multiple voices, formats, and real-time streaming. 
+ +#### Basic + ```python from fishaudio import FishAudio -from fishaudio.utils import save +from fishaudio.utils import save, play client = FishAudio() -audio = client.tts.convert(text="Hello, world!") -save(audio, "output.mp3") + +audio = client.tts.convert(text="Hello, world!") # Default voice and settings +play(audio) # Play audio directly + +audio = client.tts.convert(text="Welcome to Fish Audio SDK!") +save(audio, "output.mp3") # You can also save to a file ``` -### Voice Cloning with Reference Audio +#### With Reference Voice + +Use a reference voice ID to ensure consistent voice characteristics across generations: ```python -from fishaudio import FishAudio +# Use an existing voice by ID +audio = client.tts.convert( + text="This will sound like the reference voice!", + reference_id="802e3bc2b27e49c2995d23ef70e6ac89" # Energetic Male +) +``` -client = FishAudio() +#### Instant Voice Cloning + +Immediately clone a voice from a short audio sample: -# Use a reference voice for cloning +```python +# Clone a voice from audio sample with open("reference.wav", "rb") as f: audio = client.tts.convert( text="This will sound like the reference voice!", @@ -90,7 +91,9 @@ with open("reference.wav", "rb") as f: ) ``` -### Real-time Streaming +#### Real-time Streaming + +For low-latency and real-time applications, stream audio as text is processed: ```python from fishaudio import FishAudio @@ -98,17 +101,20 @@ from fishaudio.utils import play client = FishAudio() -# Stream audio in real-time -audio_stream = client.tts.stream( - text="This audio streams as it's generated", - latency="balanced" -) +# Stream text chunks and receive audio in real-time +def text_chunks(): + yield "Hello, " + yield "this is " + yield "streaming audio!" 
+audio_stream = client.tts.stream_websocket(text_chunks(), latency="balanced") play(audio_stream) ``` ### Speech Recognition (ASR) +To transcribe audio to text: + ```python from fishaudio import FishAudio @@ -120,7 +126,9 @@ with open("audio.wav", "rb") as f: print(result.text) ``` -### List and Filter Voices +### Voice Management + +Manage voice references and list available voices. ```python from fishaudio import FishAudio @@ -128,14 +136,24 @@ from fishaudio import FishAudio client = FishAudio() # List available voices -voices = client.voices.list(language="en") +voices = client.voices.list(language="en", tags="male") + +# Get a specific voice by ID +voice = client.voices.get(voice_id="802e3bc2b27e49c2995d23ef70e6ac89") -for voice in voices: - print(f"{voice.title} - {voice.id}") +# Create a custom voice +with open("voice_sample.wav", "rb") as f: + new_voice = client.voices.create( + title="My Custom Voice", + voices=[f.read()], + description="My cloned voice" + ) ``` ### Async Usage +You can also use the SDK in asynchronous applications: + ```python import asyncio from fishaudio import AsyncFishAudio @@ -149,7 +167,9 @@ async def main(): asyncio.run(main()) ``` -### Check Account Credits +### Account + +Check your remaining API credits, usage, and account details: ```python from fishaudio import FishAudio @@ -159,53 +179,18 @@ credits = client.account.get_credits() print(f"Remaining credits: {credits.credit}") ``` -[More examples in /examples directory](./examples/) - ---- - -## Documentation - -- [API Reference](https://docs.fish.audio) - Complete API documentation with all parameters and options -- [Tutorials & Guides](https://docs.fish.audio) - Step-by-step tutorials for common use cases -- [Examples](./examples/) - Sample code demonstrating various features -- [Migration Guide](https://docs.fish.audio) - Guide for upgrading from the legacy SDK - ---- - -## Requirements - -- Python 3.9 or higher -- Fish Audio API key - [Get one here](https://fish.audio) ### 
Optional Dependencies -For audio playback utilities: +For audio playback utilities to help with playing and saving audio files, install the `utils` extra: ```bash pip install fish-audio-sdk[utils] ``` -This installs `sounddevice` and `soundfile` for the `play()` utility function. - ---- - -## Community & Support - -- [Discord Community](https://fish.audio) - Join our community for discussions and support -- [GitHub Issues](https://github.com/fishaudio/fish-audio-python/issues) - Report bugs or request features -- [Documentation](https://docs.fish.audio) - Comprehensive guides and API reference - ---- - -## License - -This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. - ---- - ## Legacy SDK -The legacy `fish_audio_sdk` module is still available for existing projects: +The legacy `fish_audio_sdk` module continues to be supported for existing projects: ```python from fish_audio_sdk import Session @@ -213,4 +198,6 @@ from fish_audio_sdk import Session session = Session("your_api_key") ``` -We recommend migrating to the new `fishaudio` module for new projects. See our [Migration Guide](https://docs.fish.audio) for assistance. +For complete legacy SDK documentation, see the [Legacy API Documentation](https://docs.fish.audio/legacy). + +We recommend migrating to the new `fishaudio` module - see our [Migration Guide](https://docs.fish.audio) for assistance. From cc8416b971f845b25eebc46ea0ad99935598ad7d Mon Sep 17 00:00:00 2001 From: James Ding Date: Wed, 12 Nov 2025 22:04:43 -0600 Subject: [PATCH 06/16] docs: update README to provide direct links to Python SDK Guide and API Reference --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3ada700..0acd0ab 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ For new projects, use the `fishaudio` module. 
For existing projects using the le ## API Documentation -See the Python API Documentation and Reference +For complete documentation and API reference, visit the [Python SDK Guide](https://docs.fish.audio/developer-guide/sdk-guide/python/) and [API Reference](https://docs.fish.audio/api-reference/sdk/python/). ## Installation From 3cc4904e116374c0660b9c11e970804d1b3248d3 Mon Sep 17 00:00:00 2001 From: James Ding Date: Thu, 13 Nov 2025 00:04:53 -0600 Subject: [PATCH 07/16] docs: update title in copy_docs.py from "Python SDK" to "Overview" --- scripts/copy_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/copy_docs.py b/scripts/copy_docs.py index a2587eb..ea82907 100644 --- a/scripts/copy_docs.py +++ b/scripts/copy_docs.py @@ -142,7 +142,7 @@ def copy_docs(sdk_root: Path, docs_root: Path) -> None: python_sdk_dir, lambda content: add_frontmatter( content, - title="Python SDK", + title="Overview", description="Fish Audio Python SDK for text-to-speech and voice cloning", icon="python", ), From 27a99094887a78c8df64a7f3980759fc317d9b9e Mon Sep 17 00:00:00 2001 From: James Ding Date: Thu, 13 Nov 2025 00:39:16 -0600 Subject: [PATCH 08/16] chore: update development status to Production/Stable in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8aa9dfb..49c8d9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ readme = "README.md" license = {text = "Apache-2.0"} keywords = ["fish-audio", "tts", "text-to-speech", "voice-cloning", "ai", "speech-synthesis"] classifiers = [ - "Development Status :: 4 - Beta", + "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", From 6228352d673064b6eb58c113062a738a46884c88 Mon Sep 17 00:00:00 2001 From: James Ding Date: Thu, 13 Nov 2025 01:41:11 -0600 Subject: [PATCH 09/16] chore: rename 
environment variable from FISH_AUDIO_API_KEY to FISH_API_KEY across the codebase --- .env.example | 2 +- .github/workflows/python.yml | 2 +- README.md | 2 +- examples/README.md | 2 +- examples/getting-started/01_simple_tts.py | 6 +++--- examples/getting-started/02_play_audio.py | 4 ++-- examples/getting-started/03_check_credits.py | 6 +++--- src/fishaudio/client.py | 4 ++-- src/fishaudio/core/client_wrapper.py | 4 ++-- tests/integration/conftest.py | 4 ++-- tests/unit/test_client.py | 2 +- tests/unit/test_core.py | 2 +- 12 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.env.example b/.env.example index 9ab9793..8ccad9e 100644 --- a/.env.example +++ b/.env.example @@ -1 +1 @@ -FISH_AUDIO_API_KEY= \ No newline at end of file +FISH_API_KEY= \ No newline at end of file diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 619f999..79de8c8 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -83,7 +83,7 @@ jobs: - name: Run integration tests run: uv run pytest tests/integration/ -v env: - FISH_AUDIO_API_KEY: ${{ secrets.FISH_AUDIO_API_KEY }} + FISH_API_KEY: ${{ secrets.FISH_API_KEY }} - name: Upload Test Artifacts uses: actions/upload-artifact@v4 diff --git a/README.md b/README.md index 0acd0ab..b6814e6 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ The client will need to be configured with an API key, which you can obtain from ```python from fishaudio import FishAudio -client = FishAudio() # Automatically reads from the FISH_AUDIO_API_KEY environment variable +client = FishAudio() # Automatically reads from the FISH_API_KEY environment variable client = FishAudio(api_key="your-api-key") # Or provide the API key directly ``` diff --git a/examples/README.md b/examples/README.md index cc7510d..1132cd1 100644 --- a/examples/README.md +++ b/examples/README.md @@ -5,5 +5,5 @@ Example scripts demonstrating how to use the Fish Audio Python SDK. 
```bash # Install and setup pip install fishaudio -export FISH_AUDIO_API_KEY="your_api_key" +export FISH_API_KEY="your_api_key" ``` \ No newline at end of file diff --git a/examples/getting-started/01_simple_tts.py b/examples/getting-started/01_simple_tts.py index f312848..2574ce7 100644 --- a/examples/getting-started/01_simple_tts.py +++ b/examples/getting-started/01_simple_tts.py @@ -10,7 +10,7 @@ pip install fishaudio Environment Setup: - export FISH_AUDIO_API_KEY="your_api_key_here" + export FISH_API_KEY="your_api_key_here" # Or pass api_key directly to the client Expected Output: @@ -25,7 +25,7 @@ def main(): # Initialize the client with your API key - # Option 1: Use environment variable FISH_AUDIO_API_KEY + # Option 1: Use environment variable FISH_API_KEY # Option 2: Pass api_key directly: FishAudio(api_key="your_key") client = FishAudio() @@ -52,4 +52,4 @@ def main(): except Exception as e: print(f"Error: {e}") print("\nMake sure you have set your API key:") - print(" export FISH_AUDIO_API_KEY='your_api_key'") + print(" export FISH_API_KEY='your_api_key'") diff --git a/examples/getting-started/02_play_audio.py b/examples/getting-started/02_play_audio.py index 5b62750..34c8e29 100644 --- a/examples/getting-started/02_play_audio.py +++ b/examples/getting-started/02_play_audio.py @@ -19,7 +19,7 @@ # pip install sounddevice soundfile Environment Setup: - export FISH_AUDIO_API_KEY="your_api_key_here" + export FISH_API_KEY="your_api_key_here" Expected Output: - Plays the generated audio through your speakers @@ -98,7 +98,7 @@ def demo_playback_methods(): except Exception as e: print(f"Error: {e}") print("\nTroubleshooting:") - print("1. Make sure your API key is set: export FISH_AUDIO_API_KEY='your_key'") + print("1. Make sure your API key is set: export FISH_API_KEY='your_key'") print("2. 
Install ffmpeg for audio playback:") print(" - macOS: brew install ffmpeg") print(" - Ubuntu: sudo apt install ffmpeg") diff --git a/examples/getting-started/03_check_credits.py b/examples/getting-started/03_check_credits.py index 68fc721..87412ec 100644 --- a/examples/getting-started/03_check_credits.py +++ b/examples/getting-started/03_check_credits.py @@ -13,7 +13,7 @@ pip install fishaudio Environment Setup: - export FISH_AUDIO_API_KEY="your_api_key_here" + export FISH_API_KEY="your_api_key_here" Expected Output: - Displays account credit balance @@ -84,7 +84,7 @@ def check_api_setup(): print(f" Error: {e}") print("\nPlease check:") print(" 1. Your API key is correct") - print(" 2. Environment variable is set: export FISH_AUDIO_API_KEY='your_key'") + print(" 2. Environment variable is set: export FISH_API_KEY='your_key'") print(" 3. You have an active internet connection") return False @@ -100,6 +100,6 @@ def check_api_setup(): except Exception as e: print(f"\nError: {e}") print("\nMake sure you have set your API key:") - print(" export FISH_AUDIO_API_KEY='your_api_key'") + print(" export FISH_API_KEY='your_api_key'") print("\nOr pass it directly when creating the client:") print(" client = FishAudio(api_key='your_api_key')") diff --git a/src/fishaudio/client.py b/src/fishaudio/client.py index 5a914cf..53be1ec 100644 --- a/src/fishaudio/client.py +++ b/src/fishaudio/client.py @@ -51,7 +51,7 @@ def __init__( Initialize Fish Audio client. Args: - api_key: API key (can also use FISH_AUDIO_API_KEY env var) + api_key: API key (can also use FISH_API_KEY env var) base_url: API base URL timeout: Request timeout in seconds httpx_client: Optional custom HTTP client @@ -145,7 +145,7 @@ def __init__( Initialize async Fish Audio client. 
Args: - api_key: API key (can also use FISH_AUDIO_API_KEY env var) + api_key: API key (can also use FISH_API_KEY env var) base_url: API base URL timeout: Request timeout in seconds httpx_client: Optional custom async HTTP client diff --git a/src/fishaudio/core/client_wrapper.py b/src/fishaudio/core/client_wrapper.py index 2173f28..f1232f7 100644 --- a/src/fishaudio/core/client_wrapper.py +++ b/src/fishaudio/core/client_wrapper.py @@ -53,10 +53,10 @@ def __init__( api_key: Optional[str] = None, base_url: str = "https://api.fish.audio", ): - self.api_key = api_key or os.getenv("FISH_AUDIO_API_KEY") + self.api_key = api_key or os.getenv("FISH_API_KEY") if not self.api_key: raise ValueError( - "API key must be provided either as argument or via FISH_AUDIO_API_KEY environment variable" + "API key must be provided either as argument or via FISH_API_KEY environment variable" ) self.base_url = base_url diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 00ec2d8..c2b2094 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -21,9 +21,9 @@ @pytest.fixture def api_key(): """Get API key from environment.""" - key = os.getenv("FISH_AUDIO_API_KEY") + key = os.getenv("FISH_API_KEY") if not key: - pytest.skip("No API key available (set FISH_AUDIO_API_KEY)") + pytest.skip("No API key available (set FISH_API_KEY)") return key diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index f1aa2c0..d288491 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -20,7 +20,7 @@ def test_init_with_api_key(self, mock_api_key): assert client._client_wrapper.api_key == mock_api_key def test_init_with_env_var(self, mock_api_key): - with patch.dict("os.environ", {"FISH_AUDIO_API_KEY": mock_api_key}): + with patch.dict("os.environ", {"FISH_API_KEY": mock_api_key}): client = FishAudio() assert client._client_wrapper.api_key == mock_api_key diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index 
76a3611..f77dc04 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -67,7 +67,7 @@ def test_init_without_api_key_raises(self): ClientWrapper() def test_init_with_env_var(self, mock_api_key): - with patch.dict("os.environ", {"FISH_AUDIO_API_KEY": mock_api_key}): + with patch.dict("os.environ", {"FISH_API_KEY": mock_api_key}): wrapper = ClientWrapper() assert wrapper.api_key == mock_api_key From b5b525838ba0fdd7535704c1324d7f74aa88eff8 Mon Sep 17 00:00:00 2001 From: James Ding Date: Thu, 13 Nov 2025 03:44:44 -0600 Subject: [PATCH 10/16] feat: add streaming support for text-to-speech with AudioStream and AsyncAudioStream classes --- README.md | 18 +- src/fishaudio/__init__.py | 4 + src/fishaudio/core/iterators.py | 113 ++++++++++ src/fishaudio/resources/tts.py | 244 ++++++++++++++-------- tests/integration/conftest.py | 11 +- tests/integration/test_asr_integration.py | 12 +- tests/integration/test_tts_integration.py | 67 +++--- tests/unit/test_tts.py | 179 ++++++++-------- 8 files changed, 417 insertions(+), 231 deletions(-) create mode 100644 src/fishaudio/core/iterators.py diff --git a/README.md b/README.md index b6814e6..c57d150 100644 --- a/README.md +++ b/README.md @@ -91,9 +91,23 @@ with open("reference.wav", "rb") as f: ) ``` -#### Real-time Streaming +#### Streaming Audio Chunks -For low-latency and real-time applications, stream audio as text is processed: +For processing audio chunks as they're generated: + +```python +# Stream and process audio chunks +for chunk in client.tts.stream(text="Long text content..."): + # Process each chunk as it arrives + send_to_websocket(chunk) + +# Or collect all chunks +audio = client.tts.stream(text="Hello!").collect() +``` + +#### Real-time WebSocket Streaming + +For low-latency bidirectional streaming where you send text chunks and receive audio in real-time: ```python from fishaudio import FishAudio diff --git a/src/fishaudio/__init__.py b/src/fishaudio/__init__.py index bf33f15..dcedf83 100644 --- 
a/src/fishaudio/__init__.py +++ b/src/fishaudio/__init__.py @@ -28,6 +28,7 @@ from ._version import __version__ from .client import AsyncFishAudio, FishAudio +from .core.iterators import AsyncAudioStream, AudioStream from .exceptions import ( APIError, AuthenticationError, @@ -52,6 +53,9 @@ "play", "save", "stream", + # Audio streams + "AudioStream", + "AsyncAudioStream", # Types "FlushEvent", "TextEvent", diff --git a/src/fishaudio/core/iterators.py b/src/fishaudio/core/iterators.py new file mode 100644 index 0000000..971618a --- /dev/null +++ b/src/fishaudio/core/iterators.py @@ -0,0 +1,113 @@ +"""Audio stream wrappers with collection utilities.""" + +from typing import AsyncIterator, Iterator + + +class AudioStream: + """Wrapper for sync audio byte streams with collection utilities. + + This class wraps an iterator of audio bytes and provides a convenient + `.collect()` method to gather all chunks into a single bytes object. + + Examples: + ```python + from fishaudio import FishAudio + + client = FishAudio(api_key="...") + + # Collect all audio at once + audio = client.tts.stream(text="Hello!").collect() + + # Or stream chunks manually + for chunk in client.tts.stream(text="Hello!"): + process_chunk(chunk) + ``` + """ + + def __init__(self, iterator: Iterator[bytes]): + """Initialize the audio iterator wrapper. + + Args: + iterator: The underlying iterator of audio bytes + """ + self._iter = iterator + + def __iter__(self) -> Iterator[bytes]: + """Allow direct iteration over audio chunks.""" + return self._iter + + def collect(self) -> bytes: + """Collect all audio chunks into a single bytes object. + + This consumes the iterator and returns all audio data as bytes. + After calling this method, the iterator cannot be used again. 
+ + Returns: + Complete audio data as bytes + + Examples: + ```python + audio = client.tts.stream(text="Hello!").collect() + with open("output.mp3", "wb") as f: + f.write(audio) + ``` + """ + chunks = [] + for chunk in self._iter: + chunks.append(chunk) + return b"".join(chunks) + + +class AsyncAudioStream: + """Wrapper for async audio byte streams with collection utilities. + + This class wraps an async iterator of audio bytes and provides a convenient + `.collect()` method to gather all chunks into a single bytes object. + + Examples: + ```python + from fishaudio import AsyncFishAudio + + client = AsyncFishAudio(api_key="...") + + # Collect all audio at once + audio = await (await client.tts.stream(text="Hello!")).collect() + + # Or stream chunks manually + async for chunk in await client.tts.stream(text="Hello!"): + await process_chunk(chunk) + ``` + """ + + def __init__(self, async_iterator: AsyncIterator[bytes]): + """Initialize the async audio iterator wrapper. + + Args: + async_iterator: The underlying async iterator of audio bytes + """ + self._iter = async_iterator + + def __aiter__(self) -> AsyncIterator[bytes]: + """Allow direct async iteration over audio chunks.""" + return self._iter + + async def collect(self) -> bytes: + """Collect all audio chunks into a single bytes object. + + This consumes the async iterator and returns all audio data as bytes. + After calling this method, the iterator cannot be used again. 
+ + Returns: + Complete audio data as bytes + + Examples: + ```python + audio = await (await client.tts.stream(text="Hello!")).collect() + with open("output.mp3", "wb") as f: + f.write(audio) + ``` + """ + chunks = [] + async for chunk in self._iter: + chunks.append(chunk) + return b"".join(chunks) diff --git a/src/fishaudio/resources/tts.py b/src/fishaudio/resources/tts.py index fef1cd4..578b676 100644 --- a/src/fishaudio/resources/tts.py +++ b/src/fishaudio/resources/tts.py @@ -9,6 +9,7 @@ from .realtime import aiter_websocket_audio, iter_websocket_audio from ..core import AsyncClientWrapper, ClientWrapper, RequestOptions +from ..core.iterators import AsyncAudioStream, AudioStream from ..types import ( AudioFormat, CloseEvent, @@ -58,7 +59,7 @@ class TTSClient: def __init__(self, client_wrapper: ClientWrapper): self._client = client_wrapper - def convert( + def stream( self, *, text: str, @@ -70,9 +71,9 @@ def convert( config: TTSConfig = TTSConfig(), model: Model = "s1", request_options: Optional[RequestOptions] = None, - ) -> Iterator[bytes]: + ) -> AudioStream: """ - Convert text to speech. + Stream text-to-speech audio chunks. 
Args: text: Text to synthesize @@ -86,48 +87,20 @@ def convert( request_options: Request-level overrides Returns: - Iterator of audio bytes + AudioStream object that can be iterated for audio chunks Example: ```python - from fishaudio import FishAudio, TTSConfig, ReferenceAudio + from fishaudio import FishAudio client = FishAudio(api_key="...") - # Simple usage with defaults - audio = client.tts.convert(text="Hello world") - - # With format parameter - audio = client.tts.convert(text="Hello world", format="wav") - - # With speed parameter - audio = client.tts.convert(text="Hello world", speed=1.5) - - # With reference_id parameter - audio = client.tts.convert(text="Hello world", reference_id="your_model_id") - - # With references parameter - audio = client.tts.convert( - text="Hello world", - references=[ReferenceAudio(audio=audio_bytes, text="sample")] - ) - - # Combine multiple parameters - audio = client.tts.convert( - text="Hello world", - format="wav", - speed=1.2, - latency="normal" - ) - - # Parameters override config values - config = TTSConfig(format="mp3", prosody=Prosody(speed=1.0)) - audio = client.tts.convert(text="Hello world", format="wav", config=config) - # Result: format="wav" (parameter wins) + # Stream and process chunks + for chunk in client.tts.stream(text="Hello world"): + process_audio_chunk(chunk) - with open("output.mp3", "wb") as f: - for chunk in audio: - f.write(chunk) + # Or collect all at once + audio = client.tts.stream(text="Hello world").collect() ``` """ # Build request payload from config @@ -160,10 +133,75 @@ def convert( request_options=request_options, ) - # Stream response chunks - for chunk in response.iter_bytes(): - if chunk: - yield chunk + # Create generator and wrap with AudioStream + def _stream(): + for chunk in response.iter_bytes(): + if chunk: + yield chunk + + return AudioStream(_stream()) + + def convert( + self, + *, + text: str, + reference_id: Optional[str] = None, + references: Optional[List[ReferenceAudio]] 
= None, + format: Optional[AudioFormat] = None, + latency: Optional[LatencyMode] = None, + speed: Optional[float] = None, + config: TTSConfig = TTSConfig(), + model: Model = "s1", + request_options: Optional[RequestOptions] = None, + ) -> bytes: + """ + Convert text to speech and return complete audio as bytes. + + This is a convenience method that streams all audio chunks and combines them. + For chunk-by-chunk processing, use stream() instead. + + Args: + text: Text to synthesize + reference_id: Voice reference ID (overrides config.reference_id if provided) + references: Reference audio samples (overrides config.references if provided) + format: Audio format - "mp3", "wav", "pcm", or "opus" (overrides config.format if provided) + latency: Latency mode - "normal" or "balanced" (overrides config.latency if provided) + speed: Speech speed multiplier, e.g. 1.5 for 1.5x speed (overrides config.prosody.speed if provided) + config: TTS configuration (audio settings, voice, model parameters) + model: TTS model to use + request_options: Request-level overrides + + Returns: + Complete audio as bytes + + Example: + ```python + from fishaudio import FishAudio + from fishaudio.utils import play, save + + client = FishAudio(api_key="...") + + # Get complete audio + audio = client.tts.convert(text="Hello world") + + # Play it + play(audio) + + # Or save it + save(audio, "output.mp3") + ``` + """ + return self.stream( + text=text, + reference_id=reference_id, + references=references, + format=format, + latency=latency, + speed=speed, + config=config, + model=model, + request_options=request_options, + ).collect() def stream_websocket( self, @@ -307,7 +345,7 @@ class AsyncTTSClient: def __init__(self, client_wrapper: AsyncClientWrapper): self._client = client_wrapper - async def convert( + async def stream( self, *, text: str, @@ -319,9 +357,9 @@ async def convert( config: TTSConfig = TTSConfig(), model: Model = "s1", request_options: Optional[RequestOptions] = None, - ): + ) -> 
AsyncAudioStream: """ - Convert text to speech (async). + Stream text-to-speech audio chunks (async). Args: text: Text to synthesize @@ -335,48 +373,20 @@ async def convert( request_options: Request-level overrides Returns: - Async iterator of audio bytes + AsyncAudioStream object that can be iterated for audio chunks Example: ```python - from fishaudio import AsyncFishAudio, TTSConfig, ReferenceAudio + from fishaudio import AsyncFishAudio client = AsyncFishAudio(api_key="...") - # Simple usage with defaults - audio = await client.tts.convert(text="Hello world") - - # With format parameter - audio = await client.tts.convert(text="Hello world", format="wav") - - # With speed parameter - audio = await client.tts.convert(text="Hello world", speed=1.5) - - # With reference_id parameter - audio = await client.tts.convert(text="Hello world", reference_id="your_model_id") - - # With references parameter - audio = await client.tts.convert( - text="Hello world", - references=[ReferenceAudio(audio=audio_bytes, text="sample")] - ) - - # Combine multiple parameters - audio = await client.tts.convert( - text="Hello world", - format="wav", - speed=1.2, - latency="normal" - ) - - # Parameters override config values - config = TTSConfig(format="mp3", prosody=Prosody(speed=1.0)) - audio = await client.tts.convert(text="Hello world", format="wav", config=config) - # Result: format="wav" (parameter wins) + # Stream and process chunks + async for chunk in await client.tts.stream(text="Hello world"): + await process_audio_chunk(chunk) - async with aiofiles.open("output.mp3", "wb") as f: - async for chunk in audio: - await f.write(chunk) + # Or collect all at once + audio = await (await client.tts.stream(text="Hello world")).collect() ``` """ # Build request payload from config @@ -409,10 +419,76 @@ async def convert( request_options=request_options, ) - # Stream response chunks - async for chunk in response.aiter_bytes(): - if chunk: - yield chunk + # Create async generator and wrap with 
AsyncAudioStream + async def _stream(): + async for chunk in response.aiter_bytes(): + if chunk: + yield chunk + + return AsyncAudioStream(_stream()) + + async def convert( + self, + *, + text: str, + reference_id: Optional[str] = None, + references: Optional[List[ReferenceAudio]] = None, + format: Optional[AudioFormat] = None, + latency: Optional[LatencyMode] = None, + speed: Optional[float] = None, + config: TTSConfig = TTSConfig(), + model: Model = "s1", + request_options: Optional[RequestOptions] = None, + ) -> bytes: + """ + Convert text to speech and return complete audio as bytes (async). + + This is a convenience method that streams all audio chunks and combines them. + For chunk-by-chunk processing, use stream() instead. + + Args: + text: Text to synthesize + reference_id: Voice reference ID (overrides config.reference_id if provided) + references: Reference audio samples (overrides config.references if provided) + format: Audio format - "mp3", "wav", "pcm", or "opus" (overrides config.format if provided) + latency: Latency mode - "normal" or "balanced" (overrides config.latency if provided) + speed: Speech speed multiplier, e.g. 
1.5 for 1.5x speed (overrides config.prosody.speed if provided) + config: TTS configuration (audio settings, voice, model parameters) + model: TTS model to use + request_options: Request-level overrides + + Returns: + Complete audio as bytes + + Example: + ```python + from fishaudio import AsyncFishAudio + from fishaudio.utils import play, save + + client = AsyncFishAudio(api_key="...") + + # Get complete audio + audio = await client.tts.convert(text="Hello world") + + # Play it + play(audio) + + # Or save it + save(audio, "output.mp3") + ``` + """ + stream = await self.stream( + text=text, + reference_id=reference_id, + references=references, + format=format, + latency=latency, + speed=speed, + config=config, + model=model, + request_options=request_options, + ) + return await stream.collect() async def stream_websocket( self, diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index c2b2094..7cc0ef1 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -55,17 +55,20 @@ def save_audio(): A callable that takes audio chunks and filename and saves to output/ """ - def _save(audio_chunks: list[bytes], filename: str) -> Path: - """Save audio chunks to output directory. + def _save(audio: bytes | list[bytes], filename: str) -> Path: + """Save audio to output directory. 
Args: - audio_chunks: List of audio byte chunks + audio: Audio bytes or list of audio byte chunks filename: Name of the output file (including extension) Returns: Path to the saved file """ - complete_audio = b"".join(audio_chunks) + if isinstance(audio, bytes): + complete_audio = audio + else: + complete_audio = b"".join(audio) output_file = OUTPUT_DIR / filename output_file.write_bytes(complete_audio) return output_file diff --git a/tests/integration/test_asr_integration.py b/tests/integration/test_asr_integration.py index 953f7c8..7e2602e 100644 --- a/tests/integration/test_asr_integration.py +++ b/tests/integration/test_asr_integration.py @@ -13,10 +13,7 @@ def sample_audio(self, client): """Generate sample audio for ASR testing.""" # Generate audio from known text config = TTSConfig(format="wav") - audio_chunks = list( - client.tts.convert(text="Hello world, this is a test.", config=config) - ) - return b"".join(audio_chunks) + return client.tts.convert(text="Hello world, this is a test.", config=config) def test_basic_asr(self, client, sample_audio): """Test basic speech-to-text transcription.""" @@ -54,13 +51,8 @@ class TestAsyncASRIntegration: @pytest.fixture async def async_sample_audio(self, async_client): """Generate sample audio for async ASR testing.""" - audio_chunks = [] config = TTSConfig(format="wav") - async for chunk in async_client.tts.convert( - text="Async test audio", config=config - ): - audio_chunks.append(chunk) - return b"".join(audio_chunks) + return await async_client.tts.convert(text="Async test audio", config=config) @pytest.mark.asyncio async def test_async_basic_asr(self, async_client, async_sample_audio): diff --git a/tests/integration/test_tts_integration.py b/tests/integration/test_tts_integration.py index 8d00d77..f6b4fc2 100644 --- a/tests/integration/test_tts_integration.py +++ b/tests/integration/test_tts_integration.py @@ -13,15 +13,13 @@ class TestTTSIntegration: def test_basic_tts(self, client, save_audio): """Test basic 
text-to-speech generation.""" - audio_chunks = list(client.tts.convert(text="Hello, this is a test.")) + audio = client.tts.convert(text="Hello, this is a test.") - assert len(audio_chunks) > 0 - # Verify we got audio data (check for common audio headers) - complete_audio = b"".join(audio_chunks) - assert len(complete_audio) > 1000 # Should have substantial audio data + assert len(audio) > 1000 # Should have substantial audio data + assert isinstance(audio, bytes) # Write to output directory - save_audio(audio_chunks, "test_basic_tts.mp3") + save_audio(audio, "test_basic_tts.mp3") def test_tts_with_different_formats(self, client, save_audio): """Test TTS with different audio formats.""" @@ -29,27 +27,23 @@ def test_tts_with_different_formats(self, client, save_audio): for fmt in formats: config = TTSConfig(format=fmt, chunk_length=100) - audio_chunks = list( - client.tts.convert(text=f"Testing format {fmt}", config=config) - ) - assert len(audio_chunks) > 0, f"Failed for format: {fmt}" + audio = client.tts.convert(text=f"Testing format {fmt}", config=config) + assert len(audio) > 0, f"Failed for format: {fmt}" # Write to output directory - save_audio(audio_chunks, f"test_format_{fmt}.{fmt}") + save_audio(audio, f"test_format_{fmt}.{fmt}") def test_tts_with_prosody(self, client, save_audio): """Test TTS with prosody settings.""" prosody = Prosody(speed=1.2, volume=0.5) config = TTSConfig(prosody=prosody) - audio_chunks = list( - client.tts.convert(text="Testing prosody settings", config=config) - ) + audio = client.tts.convert(text="Testing prosody settings", config=config) - assert len(audio_chunks) > 0 + assert len(audio) > 0 # Write to output directory - save_audio(audio_chunks, "test_prosody.mp3") + save_audio(audio, "test_prosody.mp3") def test_tts_with_different_models(self, client, save_audio): """Test TTS with different models.""" @@ -57,13 +51,11 @@ def test_tts_with_different_models(self, client, save_audio): for model in models: try: - audio_chunks = 
list( - client.tts.convert(text=f"Testing model {model}", model=model) - ) - assert len(audio_chunks) > 0, f"Failed for model: {model}" + audio = client.tts.convert(text=f"Testing model {model}", model=model) + assert len(audio) > 0, f"Failed for model: {model}" # Write to output directory - save_audio(audio_chunks, f"test_model_{model}.mp3") + save_audio(audio, f"test_model_{model}.mp3") except Exception as e: # Some models might not be available pytest.skip(f"Model {model} not available: {e}") @@ -73,23 +65,21 @@ def test_tts_longer_text(self, client, save_audio): long_text = "This is a longer piece of text for testing. " * 10 config = TTSConfig(chunk_length=200) - audio_chunks = list(client.tts.convert(text=long_text, config=config)) + audio = client.tts.convert(text=long_text, config=config) - assert len(audio_chunks) > 0 - complete_audio = b"".join(audio_chunks) # Longer text should produce more audio - assert len(complete_audio) > 5000 + assert len(audio) > 5000 # Write to output directory - save_audio(audio_chunks, "test_longer_text.mp3") + save_audio(audio, "test_longer_text.mp3") def test_tts_empty_text_should_fail(self, client): """Test that empty text is handled.""" # This might succeed with silence or fail - test behavior try: - audio_chunks = list(client.tts.convert(text="")) + audio = client.tts.convert(text="") # If it succeeds, verify we get something - assert len(audio_chunks) >= 0 + assert isinstance(audio, bytes) except Exception: # If it fails, that's also acceptable pass @@ -101,16 +91,13 @@ class TestAsyncTTSIntegration: @pytest.mark.asyncio async def test_basic_async_tts(self, async_client, save_audio): """Test basic async text-to-speech generation.""" - audio_chunks = [] - async for chunk in async_client.tts.convert(text="Hello from async"): - audio_chunks.append(chunk) + audio = await async_client.tts.convert(text="Hello from async") - assert len(audio_chunks) > 0 - complete_audio = b"".join(audio_chunks) - assert len(complete_audio) > 1000 
+ assert len(audio) > 1000 + assert isinstance(audio, bytes) # Write to output directory - save_audio(audio_chunks, "test_async_basic.mp3") + save_audio(audio, "test_async_basic.mp3") @pytest.mark.asyncio async def test_async_tts_with_prosody(self, async_client, save_audio): @@ -118,13 +105,9 @@ async def test_async_tts_with_prosody(self, async_client, save_audio): prosody = Prosody(speed=0.8, volume=-0.2) config = TTSConfig(prosody=prosody) - audio_chunks = [] - async for chunk in async_client.tts.convert( - text="Async prosody test", config=config - ): - audio_chunks.append(chunk) + audio = await async_client.tts.convert(text="Async prosody test", config=config) - assert len(audio_chunks) > 0 + assert len(audio) > 0 # Write to output directory - save_audio(audio_chunks, "test_async_prosody.mp3") + save_audio(audio, "test_async_prosody.mp3") diff --git a/tests/unit/test_tts.py b/tests/unit/test_tts.py index 6ddff60..47bfb06 100644 --- a/tests/unit/test_tts.py +++ b/tests/unit/test_tts.py @@ -40,15 +40,15 @@ def async_tts_client(async_mock_client_wrapper): class TestTTSClient: """Test synchronous TTSClient.""" - def test_convert_basic(self, tts_client, mock_client_wrapper): - """Test basic TTS conversion.""" + def test_stream_basic(self, tts_client, mock_client_wrapper): + """Test basic TTS streaming.""" # Setup mock response with audio chunks mock_response = Mock() mock_response.iter_bytes.return_value = iter([b"chunk1", b"chunk2", b"chunk3"]) mock_client_wrapper.request.return_value = mock_response - # Call convert - audio_chunks = list(tts_client.convert(text="Hello world")) + # Call stream + audio_chunks = list(tts_client.stream(text="Hello world")) # Verify we got chunks back assert audio_chunks == [b"chunk1", b"chunk2", b"chunk3"] @@ -67,6 +67,23 @@ def test_convert_basic(self, tts_client, mock_client_wrapper): # Check payload was msgpack encoded assert "content" in call_args[1] + def test_convert_basic(self, tts_client, mock_client_wrapper): + """Test basic 
TTS conversion returns bytes.""" + # Setup mock response with audio chunks + mock_response = Mock() + mock_response.iter_bytes.return_value = iter([b"chunk1", b"chunk2", b"chunk3"]) + mock_client_wrapper.request.return_value = mock_response + + # Call convert + audio = tts_client.convert(text="Hello world") + + # Verify we got complete audio as bytes + assert audio == b"chunk1chunk2chunk3" + assert isinstance(audio, bytes) + + # Verify request was made correctly + mock_client_wrapper.request.assert_called_once() + def test_convert_with_reference_id(self, tts_client, mock_client_wrapper): """Test TTS with reference voice ID.""" mock_response = Mock() @@ -74,7 +91,7 @@ def test_convert_with_reference_id(self, tts_client, mock_client_wrapper): mock_client_wrapper.request.return_value = mock_response config = TTSConfig(reference_id="voice_123") - list(tts_client.convert(text="Hello", config=config)) + tts_client.convert(text="Hello", config=config) # Verify reference_id in payload call_args = mock_client_wrapper.request.call_args @@ -87,7 +104,7 @@ def test_convert_with_reference_id_parameter(self, tts_client, mock_client_wrapp mock_response.iter_bytes.return_value = iter([b"audio"]) mock_client_wrapper.request.return_value = mock_response - list(tts_client.convert(text="Hello", reference_id="voice_456")) + tts_client.convert(text="Hello", reference_id="voice_456") # Verify reference_id in payload call_args = mock_client_wrapper.request.call_args @@ -103,11 +120,7 @@ def test_convert_parameter_reference_id_overrides_config( mock_client_wrapper.request.return_value = mock_response config = TTSConfig(reference_id="voice_from_config") - list( - tts_client.convert( - text="Hello", reference_id="voice_from_param", config=config - ) - ) + tts_client.convert(text="Hello", reference_id="voice_from_param", config=config) # Verify parameter reference_id takes precedence call_args = mock_client_wrapper.request.call_args @@ -126,7 +139,7 @@ def test_convert_with_references(self, 
tts_client, mock_client_wrapper): ] config = TTSConfig(references=references) - list(tts_client.convert(text="Hello", config=config)) + tts_client.convert(text="Hello", config=config) # Verify references in payload call_args = mock_client_wrapper.request.call_args @@ -146,7 +159,7 @@ def test_convert_with_references_parameter(self, tts_client, mock_client_wrapper ReferenceAudio(audio=b"ref_audio_2", text="Sample 2"), ] - list(tts_client.convert(text="Hello", references=references)) + tts_client.convert(text="Hello", references=references) # Verify references in payload call_args = mock_client_wrapper.request.call_args @@ -167,7 +180,7 @@ def test_convert_parameter_references_overrides_config( param_refs = [ReferenceAudio(audio=b"param_audio", text="Param")] config = TTSConfig(references=config_refs) - list(tts_client.convert(text="Hello", references=param_refs, config=config)) + tts_client.convert(text="Hello", references=param_refs, config=config) # Verify parameter references take precedence call_args = mock_client_wrapper.request.call_args @@ -181,7 +194,7 @@ def test_convert_with_different_backend(self, tts_client, mock_client_wrapper): mock_response.iter_bytes.return_value = iter([b"audio"]) mock_client_wrapper.request.return_value = mock_response - list(tts_client.convert(text="Hello", model="s1")) + tts_client.convert(text="Hello", model="s1") # Verify model in headers call_args = mock_client_wrapper.request.call_args @@ -196,7 +209,7 @@ def test_convert_with_prosody(self, tts_client, mock_client_wrapper): prosody = Prosody(speed=1.5, volume=0.5) config = TTSConfig(prosody=prosody) - list(tts_client.convert(text="Hello", config=config)) + tts_client.convert(text="Hello", config=config) # Verify prosody in payload call_args = mock_client_wrapper.request.call_args @@ -221,7 +234,7 @@ def test_convert_with_custom_parameters(self, tts_client, mock_client_wrapper): temperature=0.8, ) - list(tts_client.convert(text="Hello", config=config)) + 
tts_client.convert(text="Hello", config=config) # Verify parameters in payload call_args = mock_client_wrapper.request.call_args @@ -242,7 +255,7 @@ def test_convert_omit_parameters_not_sent(self, tts_client, mock_client_wrapper) mock_client_wrapper.request.return_value = mock_response # Call with defaults (None values should be excluded) - list(tts_client.convert(text="Hello")) + tts_client.convert(text="Hello") # Verify None params not in payload call_args = mock_client_wrapper.request.call_args @@ -266,14 +279,14 @@ def test_convert_with_request_options(self, tts_client, mock_client_wrapper): timeout=120.0, additional_headers={"X-Custom": "value"} ) - list(tts_client.convert(text="Hello", request_options=request_options)) + tts_client.convert(text="Hello", request_options=request_options) # Verify request_options passed through call_args = mock_client_wrapper.request.call_args assert call_args[1]["request_options"] == request_options - def test_convert_streaming_behavior(self, tts_client, mock_client_wrapper): - """Test that convert returns an iterator that can be consumed.""" + def test_stream_behavior(self, tts_client, mock_client_wrapper): + """Test that stream returns an iterator that can be consumed.""" # Setup mock with multiple chunks mock_response = Mock() chunks = [b"chunk1", b"chunk2", b"chunk3", b""] # Empty chunk should be skipped @@ -281,11 +294,11 @@ def test_convert_streaming_behavior(self, tts_client, mock_client_wrapper): mock_client_wrapper.request.return_value = mock_response # Get iterator - audio_iterator = tts_client.convert(text="Hello") + audio_stream = tts_client.stream(text="Hello") # Consume one chunk at a time result = [] - for chunk in audio_iterator: + for chunk in audio_stream: result.append(chunk) # Empty chunk should be filtered out @@ -297,9 +310,9 @@ def test_convert_empty_response(self, tts_client, mock_client_wrapper): mock_response.iter_bytes.return_value = iter([]) mock_client_wrapper.request.return_value = mock_response - 
audio_chunks = list(tts_client.convert(text="Hello")) + audio = tts_client.convert(text="Hello") - assert audio_chunks == [] + assert audio == b"" def test_convert_with_format_parameter(self, tts_client, mock_client_wrapper): """Test TTS with format as direct parameter.""" @@ -307,7 +320,7 @@ def test_convert_with_format_parameter(self, tts_client, mock_client_wrapper): mock_response.iter_bytes.return_value = iter([b"audio"]) mock_client_wrapper.request.return_value = mock_response - list(tts_client.convert(text="Hello", format="wav")) + tts_client.convert(text="Hello", format="wav") # Verify format in payload call_args = mock_client_wrapper.request.call_args @@ -320,7 +333,7 @@ def test_convert_with_opus_format(self, tts_client, mock_client_wrapper): mock_response.iter_bytes.return_value = iter([b"audio"]) mock_client_wrapper.request.return_value = mock_response - list(tts_client.convert(text="Hello", format="opus")) + tts_client.convert(text="Hello", format="opus") # Verify opus format in payload call_args = mock_client_wrapper.request.call_args @@ -333,7 +346,7 @@ def test_convert_with_latency_parameter(self, tts_client, mock_client_wrapper): mock_response.iter_bytes.return_value = iter([b"audio"]) mock_client_wrapper.request.return_value = mock_response - list(tts_client.convert(text="Hello", latency="normal")) + tts_client.convert(text="Hello", latency="normal") # Verify latency in payload call_args = mock_client_wrapper.request.call_args @@ -346,7 +359,7 @@ def test_convert_with_speed_parameter(self, tts_client, mock_client_wrapper): mock_response.iter_bytes.return_value = iter([b"audio"]) mock_client_wrapper.request.return_value = mock_response - list(tts_client.convert(text="Hello", speed=1.5)) + tts_client.convert(text="Hello", speed=1.5) # Verify speed creates prosody in payload call_args = mock_client_wrapper.request.call_args @@ -362,7 +375,7 @@ def test_convert_parameter_format_overrides_config( mock_client_wrapper.request.return_value = mock_response 
config = TTSConfig(format="wav") - list(tts_client.convert(text="Hello", format="pcm", config=config)) + tts_client.convert(text="Hello", format="pcm", config=config) # Verify parameter format takes precedence call_args = mock_client_wrapper.request.call_args @@ -378,7 +391,7 @@ def test_convert_parameter_speed_overrides_config_prosody( mock_client_wrapper.request.return_value = mock_response config = TTSConfig(prosody=Prosody(speed=2.0, volume=0.5)) - list(tts_client.convert(text="Hello", speed=1.5, config=config)) + tts_client.convert(text="Hello", speed=1.5, config=config) # Verify parameter speed takes precedence but volume is preserved call_args = mock_client_wrapper.request.call_args @@ -410,8 +423,8 @@ class TestAsyncTTSClient: """Test asynchronous AsyncTTSClient.""" @pytest.mark.asyncio - async def test_convert_basic(self, async_tts_client, async_mock_client_wrapper): - """Test basic async TTS conversion.""" + async def test_stream_basic(self, async_tts_client, async_mock_client_wrapper): + """Test basic async TTS streaming.""" # Setup mock response mock_response = Mock() @@ -422,9 +435,10 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - # Call convert and collect chunks + # Call stream and collect chunks audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello world"): + stream = await async_tts_client.stream(text="Hello world") + async for chunk in stream: audio_chunks.append(chunk) assert audio_chunks == [b"chunk1", b"chunk2", b"chunk3"] @@ -436,6 +450,29 @@ async def async_iter_bytes(): assert call_args[0][0] == "POST" assert call_args[0][1] == "/v1/tts" + @pytest.mark.asyncio + async def test_convert_basic(self, async_tts_client, async_mock_client_wrapper): + """Test basic async TTS conversion returns bytes.""" + # Setup mock response + mock_response = Mock() + + async def async_iter_bytes(): + for chunk in [b"chunk1", b"chunk2", 
b"chunk3"]: + yield chunk + + mock_response.aiter_bytes = async_iter_bytes + async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) + + # Call convert + audio = await async_tts_client.convert(text="Hello world") + + # Verify we got complete audio as bytes + assert audio == b"chunk1chunk2chunk3" + assert isinstance(audio, bytes) + + # Verify request was made + async_mock_client_wrapper.request.assert_called_once() + @pytest.mark.asyncio async def test_convert_with_reference_id( self, async_tts_client, async_mock_client_wrapper @@ -450,9 +487,7 @@ async def async_iter_bytes(): async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) config = TTSConfig(reference_id="voice_123") - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello", config=config): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", config=config) # Verify reference_id in payload call_args = async_mock_client_wrapper.request.call_args @@ -472,11 +507,7 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert( - text="Hello", reference_id="voice_456" - ): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", reference_id="voice_456") # Verify reference_id in payload call_args = async_mock_client_wrapper.request.call_args @@ -497,11 +528,9 @@ async def async_iter_bytes(): async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) config = TTSConfig(reference_id="voice_from_config") - audio_chunks = [] - async for chunk in async_tts_client.convert( + await async_tts_client.convert( text="Hello", reference_id="voice_from_param", config=config - ): - audio_chunks.append(chunk) + ) # Verify parameter reference_id takes precedence call_args = async_mock_client_wrapper.request.call_args @@ -526,11 +555,7 @@ async def 
async_iter_bytes(): ReferenceAudio(audio=b"ref_audio_2", text="Sample 2"), ] - audio_chunks = [] - async for chunk in async_tts_client.convert( - text="Hello", references=references - ): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", references=references) # Verify references in payload call_args = async_mock_client_wrapper.request.call_args @@ -556,11 +581,9 @@ async def async_iter_bytes(): param_refs = [ReferenceAudio(audio=b"param_audio", text="Param")] config = TTSConfig(references=config_refs) - audio_chunks = [] - async for chunk in async_tts_client.convert( + await async_tts_client.convert( text="Hello", references=param_refs, config=config - ): - audio_chunks.append(chunk) + ) # Verify parameter references take precedence call_args = async_mock_client_wrapper.request.call_args @@ -584,9 +607,7 @@ async def async_iter_bytes(): prosody = Prosody(speed=2.0, volume=1.0) config = TTSConfig(prosody=prosody) - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello", config=config): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", config=config) # Verify prosody in payload call_args = async_mock_client_wrapper.request.call_args @@ -607,9 +628,7 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello"): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello") # Verify OMIT params not in payload call_args = async_mock_client_wrapper.request.call_args @@ -633,11 +652,9 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello"): - audio_chunks.append(chunk) + audio = await async_tts_client.convert(text="Hello") - assert audio_chunks 
== [] + assert audio == b"" @pytest.mark.asyncio async def test_convert_with_format_parameter( @@ -652,9 +669,7 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello", format="wav"): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", format="wav") # Verify format in payload call_args = async_mock_client_wrapper.request.call_args @@ -674,9 +689,7 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello", latency="normal"): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", latency="normal") # Verify latency in payload call_args = async_mock_client_wrapper.request.call_args @@ -696,9 +709,7 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert(text="Hello", speed=1.5): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", speed=1.5) # Verify speed creates prosody in payload call_args = async_mock_client_wrapper.request.call_args @@ -719,11 +730,7 @@ async def async_iter_bytes(): async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) config = TTSConfig(format="wav") - audio_chunks = [] - async for chunk in async_tts_client.convert( - text="Hello", format="pcm", config=config - ): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", format="pcm", config=config) # Verify parameter format takes precedence call_args = async_mock_client_wrapper.request.call_args @@ -744,11 +751,7 @@ async def async_iter_bytes(): async_mock_client_wrapper.request = 
AsyncMock(return_value=mock_response) config = TTSConfig(prosody=Prosody(speed=2.0, volume=0.5)) - audio_chunks = [] - async for chunk in async_tts_client.convert( - text="Hello", speed=1.5, config=config - ): - audio_chunks.append(chunk) + await async_tts_client.convert(text="Hello", speed=1.5, config=config) # Verify parameter speed takes precedence but volume is preserved call_args = async_mock_client_wrapper.request.call_args @@ -769,11 +772,9 @@ async def async_iter_bytes(): mock_response.aiter_bytes = async_iter_bytes async_mock_client_wrapper.request = AsyncMock(return_value=mock_response) - audio_chunks = [] - async for chunk in async_tts_client.convert( + await async_tts_client.convert( text="Hello", format="wav", speed=1.3, latency="normal" - ): - audio_chunks.append(chunk) + ) # Verify all parameters in payload call_args = async_mock_client_wrapper.request.call_args From 80349873ad7851acc644228a85064f68aafccd41 Mon Sep 17 00:00:00 2001 From: James Ding Date: Thu, 13 Nov 2025 19:47:02 -0600 Subject: [PATCH 11/16] docs: enhance docstrings for models with detailed attribute descriptions --- src/fishaudio/types/account.py | 25 ++++++++++- src/fishaudio/types/asr.py | 16 ++++++- src/fishaudio/types/shared.py | 9 +++- src/fishaudio/types/tts.py | 77 +++++++++++++++++++++++++++++++--- src/fishaudio/types/voices.py | 42 +++++++++++++++++-- 5 files changed, 154 insertions(+), 15 deletions(-) diff --git a/src/fishaudio/types/account.py b/src/fishaudio/types/account.py index 2803383..966ade5 100644 --- a/src/fishaudio/types/account.py +++ b/src/fishaudio/types/account.py @@ -7,7 +7,17 @@ class Credits(BaseModel): - """User's API credit balance.""" + """User's API credit balance. 
+ + Attributes: + id: Unique credits record identifier + user_id: User identifier + credit: Current credit balance (decimal for precise accounting) + created_at: Timestamp when the credits record was created + updated_at: Timestamp when the credits were last updated + has_phone_sha256: Whether the user has a verified phone number. Optional + has_free_credit: Whether the user has received free credits. Optional + """ model_config = ConfigDict(populate_by_name=True) @@ -21,7 +31,18 @@ class Credits(BaseModel): class Package(BaseModel): - """User's prepaid package information.""" + """User's prepaid package information. + + Attributes: + id: Unique package identifier + user_id: User identifier + type: Package type identifier + total: Total units in the package + balance: Remaining units in the package + created_at: Timestamp when the package was purchased + updated_at: Timestamp when the package was last updated + finished_at: Timestamp when the package was fully consumed. None if still active + """ model_config = ConfigDict(populate_by_name=True) diff --git a/src/fishaudio/types/asr.py b/src/fishaudio/types/asr.py index 84d2dbb..db73916 100644 --- a/src/fishaudio/types/asr.py +++ b/src/fishaudio/types/asr.py @@ -6,7 +6,13 @@ class ASRSegment(BaseModel): - """A timestamped segment of transcribed text.""" + """A timestamped segment of transcribed text. + + Attributes: + text: The transcribed text for this segment + start: Segment start time in seconds + end: Segment end time in seconds + """ text: str start: float @@ -14,7 +20,13 @@ class ASRSegment(BaseModel): class ASRResponse(BaseModel): - """Response from speech-to-text transcription.""" + """Response from speech-to-text transcription. + + Attributes: + text: Complete transcription of the entire audio + duration: Total audio duration in milliseconds + segments: List of timestamped text segments. 
Empty if include_timestamps=False + """ text: str duration: float # Duration in milliseconds diff --git a/src/fishaudio/types/shared.py b/src/fishaudio/types/shared.py index df7ab4a..1e756d9 100644 --- a/src/fishaudio/types/shared.py +++ b/src/fishaudio/types/shared.py @@ -9,7 +9,12 @@ class PaginatedResponse(BaseModel, Generic[T]): - """Generic paginated response.""" + """Generic paginated response. + + Attributes: + total: Total number of items across all pages + items: List of items on the current page + """ total: int items: List[T] @@ -25,7 +30,7 @@ class PaginatedResponse(BaseModel, Generic[T]): Visibility = Literal["public", "unlist", "private"] # Training mode types -TrainMode = Literal["fast", "full"] +TrainMode = Literal["fast"] # Model state types ModelState = Literal["created", "training", "trained", "failed"] diff --git a/src/fishaudio/types/tts.py b/src/fishaudio/types/tts.py index 4dd7671..8b0923a 100644 --- a/src/fishaudio/types/tts.py +++ b/src/fishaudio/types/tts.py @@ -8,14 +8,27 @@ class ReferenceAudio(BaseModel): - """Reference audio for voice cloning/style.""" + """Reference audio for voice cloning/style. + + Attributes: + audio: Audio file bytes for the reference sample + text: Transcription of what is spoken in the reference audio. Should match exactly + what's spoken and include punctuation for proper prosody. + """ audio: bytes text: str class Prosody(BaseModel): - """Speech prosody settings (speed and volume).""" + """Speech prosody settings (speed and volume). + + Attributes: + speed: Speech speed multiplier. Range: 0.5-2.0. Default: 1.0. + Examples: 1.5 = 50% faster, 0.8 = 20% slower + volume: Volume adjustment in decibels. Range: -20.0 to 20.0. Default: 0.0 (no change). + Positive values increase volume, negative values decrease it. 
+ """ speed: Annotated[float, Field(ge=0.5, le=2.0)] = 1.0 volume: Annotated[float, Field(ge=-20.0, le=20.0)] = 0.0 @@ -45,6 +58,23 @@ class TTSConfig(BaseModel): Reusable configuration for text-to-speech requests. Create once, use multiple times. All parameters have sensible defaults. + + Attributes: + format: Audio output format. Options: "mp3", "wav", "pcm", "opus". Default: "mp3" + sample_rate: Audio sample rate in Hz. If None, uses format-specific default. + mp3_bitrate: MP3 bitrate in kbps. Options: 64, 128, 192. Default: 128 + opus_bitrate: Opus bitrate in kbps. Options: -1000, 24, 32, 48, 64. Default: 32 + normalize: Whether to normalize/clean the input text. Default: True + chunk_length: Characters per generation chunk. Range: 100-300. Default: 200. + Lower values = faster initial response, higher values = better quality + latency: Generation mode. Options: "normal" (higher quality), "balanced" (faster). Default: "balanced" + reference_id: Voice model ID from fish.audio (e.g., "802e3bc2b27e49c2995d23ef70e6ac89"). + Find IDs in voice URLs or via voices.list() + references: List of reference audio samples for instant voice cloning. Default: [] + prosody: Speech speed and volume settings. Default: None (uses natural prosody) + top_p: Nucleus sampling parameter for token selection. Range: 0.0-1.0. Default: 0.7 + temperature: Randomness in generation. Range: 0.0-1.0. Default: 0.7. + Higher = more varied, lower = more consistent """ # Audio output settings @@ -74,6 +104,21 @@ class TTSRequest(BaseModel): This model is used internally for WebSocket streaming. For the HTTP API, parameters are passed directly to methods. + + Attributes: + text: Text to synthesize into speech + chunk_length: Characters per generation chunk. Range: 100-300. Default: 200 + format: Audio output format. Options: "mp3", "wav", "pcm", "opus". Default: "mp3" + sample_rate: Audio sample rate in Hz. If None, uses format-specific default + mp3_bitrate: MP3 bitrate in kbps. 
Options: 64, 128, 192. Default: 128 + opus_bitrate: Opus bitrate in kbps. Options: -1000, 24, 32, 48, 64. Default: 32 + references: List of reference audio samples for voice cloning. Default: [] + reference_id: Voice model ID for using a specific voice. Default: None + normalize: Whether to normalize/clean the input text. Default: True + latency: Generation mode. Options: "normal", "balanced". Default: "balanced" + prosody: Speech speed and volume settings. Default: None + top_p: Nucleus sampling for token selection. Range: 0.0-1.0. Default: 0.7 + temperature: Randomness in generation. Range: 0.0-1.0. Default: 0.7 """ text: str @@ -93,26 +138,46 @@ class TTSRequest(BaseModel): # WebSocket event types for streaming TTS class StartEvent(BaseModel): - """WebSocket start event.""" + """WebSocket start event to initiate TTS streaming. + + Attributes: + event: Event type identifier, always "start" + request: TTS configuration for the streaming session + """ event: Literal["start"] = "start" request: TTSRequest class TextEvent(BaseModel): - """WebSocket text chunk event.""" + """WebSocket event to send a text chunk for synthesis. + + Attributes: + event: Event type identifier, always "text" + text: Text chunk to synthesize + """ event: Literal["text"] = "text" text: str class FlushEvent(BaseModel): - """WebSocket flush event - forces buffer to generate audio immediately.""" + """WebSocket event to force immediate audio generation from buffered text. + + Use this to ensure all buffered text is synthesized without waiting for more input. + + Attributes: + event: Event type identifier, always "flush" + """ event: Literal["flush"] = "flush" class CloseEvent(BaseModel): - """WebSocket close event.""" + """WebSocket event to end the streaming session. 
+ + Attributes: + event: Event type identifier, always "stop" + """ event: Literal["stop"] = "stop" diff --git a/src/fishaudio/types/voices.py b/src/fishaudio/types/voices.py index 90e41b2..04f7570 100644 --- a/src/fishaudio/types/voices.py +++ b/src/fishaudio/types/voices.py @@ -9,7 +9,14 @@ class Sample(BaseModel): - """A sample audio for a voice model.""" + """A sample audio for a voice model. + + Attributes: + title: Title/name of the audio sample + text: Transcription of the spoken content in the sample + task_id: Unique identifier for the sample task + audio: URL or path to the audio file + """ title: str text: str @@ -18,7 +25,13 @@ class Sample(BaseModel): class Author(BaseModel): - """Voice model author information.""" + """Voice model author information. + + Attributes: + id: Unique author identifier + nickname: Author's display name + avatar: URL to author's avatar image + """ id: str = Field(alias="_id") nickname: str @@ -27,9 +40,32 @@ class Author(BaseModel): class Voice(BaseModel): """ - A voice model + A voice model. Represents a TTS voice that can be used for synthesis. + + Attributes: + id: Unique voice model identifier (use as reference_id in TTS) + type: Model type. Options: "svc" (singing voice conversion), "tts" (text-to-speech) + title: Voice model title/name + description: Detailed description of the voice model + cover_image: URL to the voice model's cover image + train_mode: Training mode used. Options: "fast" + state: Current model state (e.g., "training", "trained", "failed") + tags: List of tags for categorization (e.g., ["male", "english", "young"]) + samples: List of audio samples demonstrating the voice + created_at: Timestamp when the model was created + updated_at: Timestamp when the model was last updated + languages: List of supported language codes (e.g., ["en", "zh"]) + visibility: Model visibility. 
Options: "public", "unlist", "private" + lock_visibility: Whether visibility setting is locked + like_count: Number of likes the model has received + mark_count: Number of bookmarks/favorites + shared_count: Number of times the model has been shared + task_count: Number of times the model has been used for generation + liked: Whether the current user has liked this model. Default: False + marked: Whether the current user has bookmarked this model. Default: False + author: Information about the voice model's creator """ id: str = Field(alias="_id") From 1ea482947236d421216ae0e76d84c5b2abec34ef Mon Sep 17 00:00:00 2001 From: James Ding Date: Thu, 13 Nov 2025 19:52:31 -0600 Subject: [PATCH 12/16] docs: update legacy SDK documentation links in README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c57d150..356be16 100644 --- a/README.md +++ b/README.md @@ -212,6 +212,6 @@ from fish_audio_sdk import Session session = Session("your_api_key") ``` -For complete legacy SDK documentation, see the [Legacy API Documentation](https://docs.fish.audio/legacy). +For complete legacy SDK documentation, see the [Legacy API Documentation](https://docs.fish.audio/archive/python-sdk-legacy). -We recommend migrating to the new `fishaudio` module - see our [Migration Guide](https://docs.fish.audio) for assistance. +We recommend migrating to the new `fishaudio` module - see our [Migration Guide](https://docs.fish.audio/archive/python-sdk-legacy/migration-guide) for assistance. 
From c68c1801ce7e7f4eaadaed86382caf5a94c2cad4 Mon Sep 17 00:00:00 2001 From: James Ding Date: Thu, 13 Nov 2025 19:58:33 -0600 Subject: [PATCH 13/16] docs: update README.md to reflect new API features and usage examples --- README.md | 273 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 153 insertions(+), 120 deletions(-) diff --git a/README.md b/README.md index 356be16..1bb11a7 100644 --- a/README.md +++ b/README.md @@ -6,212 +6,245 @@ [![codecov](https://img.shields.io/codecov/c/github/fishaudio/fish-audio-python)](https://codecov.io/gh/fishaudio/fish-audio-python) [![License](https://img.shields.io/github/license/fishaudio/fish-audio-python)](https://github.com/fishaudio/fish-audio-python/blob/main/LICENSE) -The official Python library for the Fish Audio API. +The official Python library for the Fish Audio API -## Notice: New API Available +**Documentation:** [Python SDK Guide](https://docs.fish.audio/developer-guide/sdk-guide/python/) | [API Reference](https://docs.fish.audio/api-reference/sdk/python/) -The SDK now includes a modern `fishaudio` API with improved ergonomics, better type safety, and enhanced features. - -For new projects, use the `fishaudio` module. For existing projects using the legacy API, see the [Legacy SDK section](#legacy-sdk) below - -## API Documentation - -For complete documentation and API reference, visit the [Python SDK Guide](https://docs.fish.audio/developer-guide/sdk-guide/python/) and [API Reference](https://docs.fish.audio/api-reference/sdk/python/). +> **Note:** If you're using the legacy `fish_audio_sdk` API, see the [migration guide](https://docs.fish.audio/archive/python-sdk-legacy/migration-guide) to upgrade. 
## Installation -This package is available on PyPI: - ```bash pip install fish-audio-sdk + +# With audio playback utilities +pip install fish-audio-sdk[utils] ``` -You may install from source by running the following command in the repository root: +## Authentication + +Get your API key from [fish.audio/app/api-keys](https://fish.audio/app/api-keys): ```bash -python -m pip install . +export FISH_API_KEY=your_api_key_here ``` -## Usage - -The client will need to be configured with an API key, which you can obtain from [Fish Audio](https://fish.audio/app/api-keys). +Or provide directly: ```python from fishaudio import FishAudio -client = FishAudio() # Automatically reads from the FISH_API_KEY environment variable - -client = FishAudio(api_key="your-api-key") # Or provide the API key directly +client = FishAudio(api_key="your_api_key") ``` -The SDK provides [text-to-speech](#text-to-speech), [voice cloning](#instant-voice-cloning), [speech recognition](#speech-recognition-asr), and [voice management](#voice-management) capabilities. - -### Text-to-Speech - -Convert text to natural-sounding speech with support for multiple voices, formats, and real-time streaming. 
+## Quick Start -#### Basic +**Synchronous:** ```python from fishaudio import FishAudio -from fishaudio.utils import save, play +from fishaudio.utils import play, save client = FishAudio() -audio = client.tts.convert(text="Hello, world!") # Default voice and settings -play(audio) # Play audio directly +# Generate audio +audio = client.tts.convert(text="Hello, world!") -audio = client.tts.convert(text="Welcome to Fish Audio SDK!") -save(audio, "output.mp3") # You can also save to a file +# Play or save +play(audio) +save(audio, "output.mp3") +``` + +**Asynchronous:** + +```python +import asyncio +from fishaudio import AsyncFishAudio +from fishaudio.utils import play, save + +async def main(): + client = AsyncFishAudio() + audio = await client.tts.convert(text="Hello, world!") + play(audio) + save(audio, "output.mp3") + +asyncio.run(main()) ``` -#### With Reference Voice +## Core Features -Use a reference voice ID to ensure consistent voice characteristics across generations: +### Text-to-Speech + +**With custom voice:** ```python -# Use an existing voice by ID +# Use a specific voice by ID audio = client.tts.convert( - text="This will sound like the reference voice!", - reference_id="802e3bc2b27e49c2995d23ef70e6ac89" # Energetic Male + text="Custom voice", + reference_id="802e3bc2b27e49c2995d23ef70e6ac89" ) ``` -#### Instant Voice Cloning - -Immediately clone a voice from a short audio sample: +**With speed control:** ```python -# Clone a voice from audio sample -with open("reference.wav", "rb") as f: - audio = client.tts.convert( - text="This will sound like the reference voice!", - reference_audio=f.read(), - reference_text="Transcription of the reference audio" - ) +audio = client.tts.convert( + text="Speaking faster!", + speed=1.5 # 1.5x speed +) ``` -#### Streaming Audio Chunks +**Reusable configuration:** + +```python +from fishaudio.types import TTSConfig, Prosody -For processing audio chunks as they're generated: +config = TTSConfig( + 
prosody=Prosody(speed=1.2, volume=-5), + reference_id="933563129e564b19a115bedd57b7406a", + format="wav", + latency="balanced" +) + +# Reuse across generations +audio1 = client.tts.convert(text="First message", config=config) +audio2 = client.tts.convert(text="Second message", config=config) +``` + +**Chunk-by-chunk processing:** ```python -# Stream and process audio chunks -for chunk in client.tts.stream(text="Long text content..."): - # Process each chunk as it arrives +# Stream and process chunks as they arrive +for chunk in client.tts.stream(text="Long content..."): send_to_websocket(chunk) # Or collect all chunks audio = client.tts.stream(text="Hello!").collect() ``` -#### Real-time WebSocket Streaming +[Learn more](https://docs.fish.audio/developer-guide/sdk-guide/python/text-to-speech) -For low-latency bidirectional streaming where you send text chunks and receive audio in real-time: +### Speech-to-Text ```python -from fishaudio import FishAudio -from fishaudio.utils import play +# Transcribe audio +with open("audio.wav", "rb") as f: + result = client.asr.transcribe(audio=f.read(), language="en") -client = FishAudio() +print(result.text) + +# Access timestamped segments +for segment in result.segments: + print(f"[{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}") +``` + +[Learn more](https://docs.fish.audio/developer-guide/sdk-guide/python/speech-to-text) + +### Real-time Streaming + +Stream dynamically generated text for conversational AI and live applications: + +**Synchronous:** -# Stream text chunks and receive audio in real-time +```python def text_chunks(): yield "Hello, " yield "this is " - yield "streaming audio!" + yield "streaming!" 
audio_stream = client.tts.stream_websocket(text_chunks(), latency="balanced") play(audio_stream) ``` -### Speech Recognition (ASR) - -To transcribe audio to text: +**Asynchronous:** ```python -from fishaudio import FishAudio - -client = FishAudio() +async def text_chunks(): + yield "Hello, " + yield "this is " + yield "streaming!" -# Transcribe audio to text -with open("audio.wav", "rb") as f: - result = client.asr.transcribe(audio=f.read()) - print(result.text) +audio_stream = await client.tts.stream_websocket(text_chunks(), latency="balanced") +play(audio_stream) ``` -### Voice Management +[Learn more](https://docs.fish.audio/developer-guide/sdk-guide/python/websocket) -Manage voice references and list available voices. +### Voice Cloning -```python -from fishaudio import FishAudio +**Instant cloning:** -client = FishAudio() - -# List available voices -voices = client.voices.list(language="en", tags="male") - -# Get a specific voice by ID -voice = client.voices.get(voice_id="802e3bc2b27e49c2995d23ef70e6ac89") +```python +from fishaudio.types import ReferenceAudio -# Create a custom voice -with open("voice_sample.wav", "rb") as f: - new_voice = client.voices.create( - title="My Custom Voice", - voices=[f.read()], - description="My cloned voice" +# Clone voice on-the-fly +with open("reference.wav", "rb") as f: + audio = client.tts.convert( + text="Cloned voice speaking", + references=[ReferenceAudio( + audio=f.read(), + text="Text spoken in reference" + )] ) ``` -### Async Usage - -You can also use the SDK in asynchronous applications: +**Persistent voice models:** ```python -import asyncio -from fishaudio import AsyncFishAudio - -async def main(): - client = AsyncFishAudio() - - audio = await client.tts.convert(text="Async text-to-speech!") - # Process audio... 
+# Create voice model for reuse +with open("voice_sample.wav", "rb") as f: + voice = client.voices.create( + title="My Voice", + voices=[f.read()], + description="Custom voice clone" + ) -asyncio.run(main()) +# Use the created model +audio = client.tts.convert( + text="Using my saved voice", + reference_id=voice.id +) ``` -### Account +[Learn more](https://docs.fish.audio/developer-guide/sdk-guide/python/voice-cloning) -Check your remaining API credits, usage, and account details: +## Resource Clients -```python -from fishaudio import FishAudio +| Resource | Description | Key Methods | +|----------|-------------|-------------| +| `client.tts` | Text-to-speech | `convert()`, `stream()`, `stream_websocket()` | +| `client.asr` | Speech recognition | `transcribe()` | +| `client.voices` | Voice management | `list()`, `get()`, `create()`, `update()`, `delete()` | +| `client.account` | Account info | `get_credits()`, `get_package()` | -client = FishAudio() -credits = client.account.get_credits() -print(f"Remaining credits: {credits.credit}") -``` - - -### Optional Dependencies +## Error Handling -For audio playback utilities to help with playing and saving audio files, install the `utils` extra: +```python +from fishaudio.exceptions import ( + AuthenticationError, + RateLimitError, + ValidationError, + FishAudioError +) -```bash -pip install fish-audio-sdk[utils] +try: + audio = client.tts.convert(text="Hello!") +except AuthenticationError: + print("Invalid API key") +except RateLimitError: + print("Rate limit exceeded") +except ValidationError as e: + print(f"Invalid request: {e}") +except FishAudioError as e: + print(f"API error: {e}") ``` -## Legacy SDK - -The legacy `fish_audio_sdk` module continues to be supported for existing projects: +## Resources -```python -from fish_audio_sdk import Session - -session = Session("your_api_key") -``` +- **Documentation:** [SDK Guide](https://docs.fish.audio/developer-guide/sdk-guide/python/) | [API 
Reference](https://docs.fish.audio/api-reference/sdk/python/) +- **Package:** [PyPI](https://pypi.org/project/fish-audio-sdk/) | [GitHub](https://github.com/fishaudio/fish-audio-python) +- **Legacy SDK:** [Documentation](https://docs.fish.audio/archive/python-sdk-legacy) | [Migration Guide](https://docs.fish.audio/archive/python-sdk-legacy/migration-guide) -For complete legacy SDK documentation, see the [Legacy API Documentation](https://docs.fish.audio/archive/python-sdk-legacy). +## License -We recommend migrating to the new `fishaudio` module - see our [Migration Guide](https://docs.fish.audio/archive/python-sdk-legacy/migration-guide) for assistance. +This project is licensed under the Apache-2.0 License - see the [LICENSE](LICENSE) file for details. \ No newline at end of file From 38a7fbd886219e0405b4991cedad1d49b50d6fb8 Mon Sep 17 00:00:00 2001 From: James Ding Date: Thu, 13 Nov 2025 20:10:16 -0600 Subject: [PATCH 14/16] feat: update text-to-speech implementation to use streaming method --- src/fishaudio/core/iterators.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/fishaudio/core/iterators.py b/src/fishaudio/core/iterators.py index 971618a..fbd5df8 100644 --- a/src/fishaudio/core/iterators.py +++ b/src/fishaudio/core/iterators.py @@ -16,10 +16,10 @@ class AudioStream: client = FishAudio(api_key="...") # Collect all audio at once - audio = client.tts.convert(text="Hello!").collect() + audio = client.tts.stream(text="Hello!").collect() # Or stream chunks manually - for chunk in client.tts.convert(text="Hello!"): + for chunk in client.tts.stream(text="Hello!"): process_chunk(chunk) ``` """ @@ -47,7 +47,7 @@ def collect(self) -> bytes: Examples: ```python - audio = client.tts.convert(text="Hello!").collect() + audio = client.tts.stream(text="Hello!").collect() with open("output.mp3", "wb") as f: f.write(audio) ``` @@ -71,10 +71,11 @@ class AsyncAudioStream: client = AsyncFishAudio(api_key="...") # Collect all audio at 
once - audio = await client.tts.convert(text="Hello!").collect() + stream = await client.tts.stream(text="Hello!") + audio = await stream.collect() # Or stream chunks manually - async for chunk in client.tts.convert(text="Hello!"): + async for chunk in await client.tts.stream(text="Hello!"): await process_chunk(chunk) ``` """ @@ -102,7 +103,8 @@ async def collect(self) -> bytes: Examples: ```python - audio = await client.tts.convert(text="Hello!").collect() + stream = await client.tts.stream(text="Hello!") + audio = await stream.collect() with open("output.mp3", "wb") as f: f.write(audio) ``` From 22b37339a4caf5a80ffaf8bf3902828873ea4e3d Mon Sep 17 00:00:00 2001 From: James Ding Date: Thu, 13 Nov 2025 20:25:43 -0600 Subject: [PATCH 15/16] fix: correct async streaming method calls in text-to-speech implementation --- src/fishaudio/resources/tts.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/fishaudio/resources/tts.py b/src/fishaudio/resources/tts.py index 578b676..bd3ceec 100644 --- a/src/fishaudio/resources/tts.py +++ b/src/fishaudio/resources/tts.py @@ -382,11 +382,12 @@ async def stream( client = AsyncFishAudio(api_key="...") # Stream and process chunks - async for chunk in client.tts.stream(text="Hello world"): + async for chunk in await client.tts.stream(text="Hello world"): await process_audio_chunk(chunk) # Or collect all at once - audio = await client.tts.stream(text="Hello world").collect() + stream = await client.tts.stream(text="Hello world") + audio = await stream.collect() ``` """ # Build request payload from config From 30846d44fb054eb1749ba37a268825f9637772c2 Mon Sep 17 00:00:00 2001 From: James Ding Date: Thu, 13 Nov 2025 20:42:22 -0600 Subject: [PATCH 16/16] chore: update Python version to 3.9 and add future annotations for type hints --- .github/workflows/python.yml | 2 +- tests/integration/conftest.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python.yml 
b/.github/workflows/python.yml index 79de8c8..f6b6ca4 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -72,7 +72,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.x" + python-version: "3.9" - name: Install uv uses: astral-sh/setup-uv@v4 diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 7cc0ef1..2d43b32 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,5 +1,7 @@ """Fixtures for integration tests.""" +from __future__ import annotations + import os from pathlib import Path