faiss_rag_enterprise/llama_index/tts/bark.py

"""Bark TTS module."""
import os
import tempfile
from typing import Any, Optional

import numpy as np

from llama_index.tts.base import BaseTTS

# Text is split into chunks of 10 words to avoid hallucination in Bark.
DEFAULT_CHUNK_SIZE = 10


class BarkTTS(BaseTTS):
"""Bark TTS.
Args:
text_temp: generation temperature (1.0 more diverse, \
0.0 more conservative)
waveform_temp: generation temperature (1.0 more diverse, \
0.0 more conservative)
lang_speaker_voice: language speaker voice for audio cloning.
"""
def __init__(
self,
text_temp: float = 0.7,
waveform_temp: float = 0.7,
lang_speaker_voice: Optional[str] = None,
) -> None:
"""Init params."""
super().__init__()
self.text_temp = text_temp
self.waveform_temp = waveform_temp
self.lang_speaker_voice = lang_speaker_voice
def generate_audio(self, text: str) -> Any:
"""Generate audio from text.
NOTE: return type is Any, but it should be any object that can be fed
as `data` into IPython.display.Audio(). This includes numpy array, list,
unicode, str or bytes
Args:
text: text to be turned into audio.
"""
import_err_msg = "`bark` package not found, \
please run `pip install git+https://github.com/suno-ai/bark.git`"
try:
import bark
except ImportError:
raise ImportError(import_err_msg)
words = text.split()
chunks = [
words[i : i + DEFAULT_CHUNK_SIZE]
for i in range(0, len(words), DEFAULT_CHUNK_SIZE)
]
chunks = [" ".join(chunk) for chunk in chunks] # type: ignore
full_generation = None
history_prompt = self.lang_speaker_voice
audio_chunks = []
for chunk in chunks:
with tempfile.TemporaryDirectory() as d:
if full_generation:
f = os.path.join(d, "history_prompt.npz")
bark.save_as_prompt(f, full_generation)
history_prompt = f
full_generation, audio_array = bark.generate_audio(
chunk,
history_prompt=history_prompt,
text_temp=self.text_temp,
waveform_temp=self.waveform_temp,
output_full=True,
)
audio_chunks.append(audio_array)
return np.concatenate(audio_chunks)
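

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how this class might be exercised, assuming the `bark`
# package (with downloaded model weights) is installed and that `scipy` is
# available for writing a WAV file; neither assumption is required by the
# module above. Bark's documented output sample rate is 24 kHz.
if __name__ == "__main__":
    from scipy.io import wavfile  # assumption: scipy is installed

    tts = BarkTTS(text_temp=0.7, waveform_temp=0.7)
    audio = tts.generate_audio(
        "Hello from Bark. This sentence is long enough that the generator "
        "will split it into two ten-word chunks before synthesis."
    )

    # Write the concatenated float waveform to disk for a quick listen.
    wavfile.write("bark_demo.wav", 24_000, audio.astype(np.float32))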