integrate eleven labs

2023-03-20 20:37:39 -07:00 · 2023-03-20 20:37:39 -07:00 · 97c7a4f8ac
commit 97c7a4f8ac
parent a01e3b2fb8
3 changed files with 32 additions and 5 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "vocode"
-version = "0.1.48"
+version = "0.1.49"
 description = "The all-in-one voice SDK"
 authors = ["Ajay Raj <ajay@vocode.dev>"]
 license = "MIT License"
--- a/simple_turn_based_conversation.py
+++ b/simple_turn_based_conversation.py
@@ -4,6 +4,7 @@ import os
 from vocode.helpers import create_microphone_input_and_speaker_output
 from vocode.turn_based.agent.chat_gpt_agent import ChatGPTAgent
 from vocode.turn_based.synthesizer.azure_synthesizer import AzureSynthesizer
+from vocode.turn_based.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
 from vocode.turn_based.transcriber.whisper_transcriber import WhisperTranscriber
 from vocode.turn_based.turn_based_conversation import TurnBasedConversation

@@ -13,6 +14,9 @@ logger.setLevel(logging.INFO)

 load_dotenv()

+# Voice IDs are listed at https://api.elevenlabs.io/v1/voices
+ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
+
 if __name__ == "__main__":
    microphone_input, speaker_output = create_microphone_input_and_speaker_output(
        streaming=False, use_default_devices=False
@@ -27,10 +31,9 @@ if __name__ == "__main__":
            initial_message="Hello!",
            api_key=os.getenv("OPENAI_API_KEY"),
        ),
-        synthesizer=AzureSynthesizer(
-            sampling_rate=speaker_output.sampling_rate,
-            api_key=os.getenv("AZURE_SPEECH_KEY"),
-            region=os.getenv("AZURE_SPEECH_REGION"),
+        synthesizer=ElevenLabsSynthesizer(
+            voice_id=ADAM_VOICE_ID,
+            api_key=os.getenv("ELEVEN_LABS_API_KEY"),
        ),
        logger=logger,
    )
--- a/vocode/turn_based/synthesizer/eleven_labs_synthesizer.py
+++ b/vocode/turn_based/synthesizer/eleven_labs_synthesizer.py
@@ -0,0 +1,24 @@
+import io
+import os
+from typing import Optional
+from pydub import AudioSegment
+import requests
+from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer
+
+ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/"
+
+
+class ElevenLabsSynthesizer(BaseSynthesizer):
+    """Turn-based synthesizer backed by the ElevenLabs text-to-speech HTTP API."""
+
+    def __init__(self, voice_id: str, api_key: Optional[str] = None):
+        """Store the voice to use and resolve the API key.
+
+        Args:
+            voice_id: ElevenLabs voice identifier; it is interpolated into
+                the request URL.
+            api_key: Key sent in the ``xi-api-key`` header. When omitted or
+                empty, the ``ELEVEN_LABS_API_KEY`` environment variable is
+                used instead.
+        """
+        self.voice_id = voice_id
+        # An explicitly supplied key must win over the environment; the
+        # environment variable is only a fallback.
+        self.api_key = api_key or os.getenv("ELEVEN_LABS_API_KEY")
+
+    def synthesize(self, text: str) -> AudioSegment:
+        """Synthesize ``text`` with the configured voice.
+
+        Sends one blocking POST to ``text-to-speech/{voice_id}`` under
+        ``ELEVEN_LABS_BASE_URL`` and decodes the response body as MP3.
+
+        Args:
+            text: The text to speak.
+
+        Returns:
+            The decoded audio as a pydub ``AudioSegment``.
+
+        Raises:
+            AssertionError: If the API answers with an error status
+                (``response.ok`` is false); the message is the response
+                body text.
+        """
+        url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}"
+        headers = {"xi-api-key": self.api_key, "voice_id": self.voice_id}
+        body = {
+            "text": text,
+        }
+        response = requests.post(url, headers=headers, json=body)
+        # Raised explicitly rather than via `assert` so the check still runs
+        # under `python -O`; the exception type is kept for existing callers.
+        if not response.ok:
+            raise AssertionError(response.text)
+        return AudioSegment.from_mp3(io.BytesIO(response.content))