faiss_rag_enterprise/llama_index/tts/bark.py

"""Bark TTS module."""
import os
import tempfile
from typing import Any, Optional

import numpy as np

from llama_index.tts.base import BaseTTS

# Text is split into chunks of 10 words to avoid hallucination in Bark.
DEFAULT_CHUNK_SIZE = 10


class BarkTTS(BaseTTS):
"""Bark TTS.
Args:
text_temp: generation temperature (1.0 more diverse, \
0.0 more conservative)
waveform_temp: generation temperature (1.0 more diverse, \
0.0 more conservative)
lang_speaker_voice: language speaker voice for audio cloning.
"""
def __init__(
self,
text_temp: float = 0.7,
waveform_temp: float = 0.7,
lang_speaker_voice: Optional[str] = None,
) -> None:
"""Init params."""
super().__init__()
self.text_temp = text_temp
self.waveform_temp = waveform_temp
self.lang_speaker_voice = lang_speaker_voice
def generate_audio(self, text: str) -> Any:
"""Generate audio from text.
NOTE: return type is Any, but it should be any object that can be fed
as `data` into IPython.display.Audio(). This includes numpy array, list,
unicode, str or bytes
Args:
text: text to be turned into audio.
"""
import_err_msg = "`bark` package not found, \
please run `pip install git+https://github.com/suno-ai/bark.git`"
try:
import bark
except ImportError:
raise ImportError(import_err_msg)
words = text.split()
chunks = [
words[i : i + DEFAULT_CHUNK_SIZE]
for i in range(0, len(words), DEFAULT_CHUNK_SIZE)
]
chunks = [" ".join(chunk) for chunk in chunks] # type: ignore
full_generation = None
history_prompt = self.lang_speaker_voice
audio_chunks = []
for chunk in chunks:
with tempfile.TemporaryDirectory() as d:
if full_generation:
f = os.path.join(d, "history_prompt.npz")
bark.save_as_prompt(f, full_generation)
history_prompt = f
full_generation, audio_array = bark.generate_audio(
chunk,
history_prompt=history_prompt,
text_temp=self.text_temp,
waveform_temp=self.waveform_temp,
output_full=True,
)
audio_chunks.append(audio_array)
return np.concatenate(audio_chunks)
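

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how this class might be exercised, assuming the `bark`
# package (with downloaded model weights) is installed and that `scipy` is
# available for writing a WAV file; neither assumption is required by the
# module above. Bark's documented output sample rate is 24 kHz.
if __name__ == "__main__":
    from scipy.io import wavfile  # assumption: scipy is installed

    tts = BarkTTS(text_temp=0.7, waveform_temp=0.7)
    audio = tts.generate_audio(
        "Hello from Bark. This sentence is long enough that the generator "
        "will split it into two ten-word chunks before synthesis."
    )

    # Write the concatenated float waveform to disk for a quick listen.
    wavfile.write("bark_demo.wav", 24_000, audio.astype(np.float32))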