From 1dc7bc74c3299f45d0ec2b5428d6ab8bf0ea7123 Mon Sep 17 00:00:00 2001 From: Ajay Raj Date: Tue, 28 Mar 2023 10:20:36 -0700 Subject: [PATCH] remove pyq goodbye model and rime synthesizer and fix environment loading --- examples/hosted_streaming_conversation.py | 12 ++- examples/streaming_conversation.py | 28 +++---- examples/telephony_app.py | 46 +++++------ examples/turn_based_conversation.py | 8 +- vocode/__init__.py | 18 ++++- vocode/streaming/agent/chat_gpt_agent.py | 10 +-- vocode/streaming/agent/llm_agent.py | 3 - vocode/streaming/factory.py | 8 +- .../hosted_streaming_conversation.py | 4 - vocode/streaming/streaming_conversation.py | 26 +++---- .../synthesizer/azure_synthesizer.py | 7 +- .../synthesizer/eleven_labs_synthesizer.py | 7 +- .../synthesizer/google_synthesizer.py | 3 - .../streaming/synthesizer/rime_synthesizer.py | 78 ------------------- .../config_manager/base_config_manager.py | 1 - .../streaming/telephony/conversation/call.py | 49 ++++++------ .../telephony/conversation/outbound_call.py | 9 ++- .../telephony/conversation/zoom_dial_in.py | 2 +- vocode/streaming/telephony/server/base.py | 5 +- vocode/streaming/telephony/twilio.py | 4 - .../transcriber/assembly_ai_transcriber.py | 8 +- .../streaming/transcriber/base_transcriber.py | 3 - .../transcriber/deepgram_transcriber.py | 8 +- vocode/streaming/utils/goodbye_model.py | 63 +++++---------- vocode/turn_based/agent/chat_gpt_agent.py | 4 +- .../synthesizer/azure_synthesizer.py | 6 +- .../synthesizer/eleven_labs_synthesizer.py | 4 +- .../transcriber/whisper_transcriber.py | 4 +- 28 files changed, 143 insertions(+), 285 deletions(-) delete mode 100644 vocode/streaming/synthesizer/rime_synthesizer.py diff --git a/examples/hosted_streaming_conversation.py b/examples/hosted_streaming_conversation.py index 59152f0..4a119bc 100644 --- a/examples/hosted_streaming_conversation.py +++ b/examples/hosted_streaming_conversation.py @@ -2,14 +2,15 @@ import asyncio import logging import signal from dotenv 
import load_dotenv -import os + +load_dotenv() + from vocode.streaming.hosted_streaming_conversation import HostedStreamingConversation from vocode.streaming.streaming_conversation import StreamingConversation from vocode.helpers import create_microphone_input_and_speaker_output from vocode.streaming.models.transcriber import ( DeepgramTranscriberConfig, PunctuationEndpointingConfig, - GoogleTranscriberConfig, ) from vocode.streaming.models.agent import ( ChatGPTAgentConfig, @@ -23,10 +24,6 @@ from vocode.streaming.models.agent import ( ) from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.synthesizer import AzureSynthesizerConfig -import vocode - -load_dotenv() -vocode.api_key = os.getenv("VOCODE_API_KEY") logging.basicConfig() logging.root.setLevel(logging.INFO) @@ -41,7 +38,8 @@ if __name__ == "__main__": input_device=microphone_input, output_device=speaker_output, transcriber_config=DeepgramTranscriberConfig.from_input_device( - microphone_input + microphone_input, + endpointing_config=PunctuationEndpointingConfig(), ), agent_config=ChatGPTAgentConfig( initial_message=BaseMessage(text="Hello!"), diff --git a/examples/streaming_conversation.py b/examples/streaming_conversation.py index 60d2d90..c0be5be 100644 --- a/examples/streaming_conversation.py +++ b/examples/streaming_conversation.py @@ -2,7 +2,9 @@ import asyncio import logging import signal from dotenv import load_dotenv -import os + +load_dotenv() + from vocode.streaming.agent.chat_gpt_agent import ChatGPTAgent from vocode.streaming.streaming_conversation import StreamingConversation from vocode.helpers import create_microphone_input_and_speaker_output @@ -31,8 +33,6 @@ import vocode from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber -load_dotenv() -vocode.api_key = os.getenv("VOCODE_API_KEY") logging.basicConfig() logger = logging.getLogger(__name__) @@ -46,23 
+46,17 @@ async def main(): conversation = StreamingConversation( output_device=speaker_output, - transcriber=DeepgramTranscriber( - DeepgramTranscriberConfig.from_input_device( - microphone_input, endpointing_config=PunctuationEndpointingConfig() - ) + transcriber=DeepgramTranscriberConfig.from_input_device( + microphone_input, endpointing_config=PunctuationEndpointingConfig() ), - agent=ChatGPTAgent( - ChatGPTAgentConfig( - initial_message=BaseMessage(text="What up"), - prompt_preamble="""You are a helpful gen Z AI assistant. You use slang like um, but, and like a LOT. All of your responses are 10 words or less. Be super chill, use slang like + agent=ChatGPTAgentConfig( + initial_message=BaseMessage(text="What up"), + prompt_preamble="""You are a helpful gen Z AI assistant. You use slang like um, but, and like a LOT. All of your responses are 10 words or less. Be super chill, use slang like hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, simp, cap, big yikes, main character, dank""", - generate_responses=True, - cut_off_response=CutOffResponse(), - ) - ), - synthesizer=AzureSynthesizer( - AzureSynthesizerConfig.from_output_device(speaker_output), + generate_responses=True, + cut_off_response=CutOffResponse(), ), + synthesizer=AzureSynthesizerConfig.from_output_device(speaker_output), logger=logger, ) await conversation.start() diff --git a/examples/telephony_app.py b/examples/telephony_app.py index 8162cdb..86e81c2 100644 --- a/examples/telephony_app.py +++ b/examples/telephony_app.py @@ -1,7 +1,7 @@ import logging from fastapi import FastAPI -import os from dotenv import load_dotenv +from vocode import getenv load_dotenv() @@ -34,13 +34,12 @@ telephony_server = TelephonyServer( url="/inbound_call", agent_config=ChatGPTAgentConfig( initial_message=BaseMessage(text="What up"), - prompt_preamble="""You are a helpful gen Z AI assistant. You use slang like um, but, and like a LOT. All of your responses are 10 words or less. 
Be super chill, use slang like -hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, simp, cap, big yikes, main character, dank""", + prompt_preamble="Have a pleasant conversation about life", generate_responses=True, ), twilio_config=TwilioConfig( - account_sid=os.getenv("TWILIO_ACCOUNT_SID"), - auth_token=os.getenv("TWILIO_AUTH_TOKEN"), + account_sid=getenv("TWILIO_ACCOUNT_SID"), + auth_token=getenv("TWILIO_AUTH_TOKEN"), ), ) ], @@ -49,21 +48,22 @@ hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, app.include_router(telephony_server.get_router()) -# outbound_call = OutboundCall( -# base_url=BASE_URL, -# to_phone="+14088926228", -# from_phone="+14086600744", -# config_manager=config_manager, -# agent_config=ChatGPTAgentConfig( -# initial_message=BaseMessage(text="What up"), -# prompt_preamble="""You are a helpful gen Z AI assistant. You use slang like um, but, and like a LOT. All of your responses are 10 words or less. Be super chill, use slang like -# hella, down, fire, totally, but like, slay, vibing, queen, go off, bet, sus, simp, cap, big yikes, main character, dank""", -# generate_responses=True, -# ), -# twilio_config=TwilioConfig( -# account_sid=os.getenv("TWILIO_ACCOUNT_SID"), -# auth_token=os.getenv("TWILIO_AUTH_TOKEN"), -# ), -# logger=logger, -# ) -# outbound_call.start() +outbound_call = OutboundCall( + base_url=BASE_URL, + to_phone="+14088926228", + from_phone="+14086600744", + config_manager=config_manager, + agent_config=ChatGPTAgentConfig( + initial_message=BaseMessage(text="What up"), + prompt_preamble="Have a pleasant conversation about life", + generate_responses=True, + ), + twilio_config=TwilioConfig( + account_sid=getenv("TWILIO_ACCOUNT_SID"), + auth_token=getenv("TWILIO_AUTH_TOKEN"), + ), + logger=logger, +) + +input("Press enter to start call...") +outbound_call.start() diff --git a/examples/turn_based_conversation.py b/examples/turn_based_conversation.py index 9681aaf..4bd5fce 100644 --- 
a/examples/turn_based_conversation.py +++ b/examples/turn_based_conversation.py @@ -1,6 +1,6 @@ import logging from dotenv import load_dotenv -import os +from vocode import getenv from vocode.helpers import create_microphone_input_and_speaker_output from vocode.turn_based.agent.chat_gpt_agent import ChatGPTAgent from vocode.turn_based.synthesizer.azure_synthesizer import AzureSynthesizer @@ -25,15 +25,15 @@ if __name__ == "__main__": conversation = TurnBasedConversation( input_device=microphone_input, output_device=speaker_output, - transcriber=WhisperTranscriber(api_key=os.getenv("OPENAI_API_KEY")), + transcriber=WhisperTranscriber(api_key=getenv("OPENAI_API_KEY")), agent=ChatGPTAgent( system_prompt="The AI is having a pleasant conversation about life", initial_message="Hello!", - api_key=os.getenv("OPENAI_API_KEY"), + api_key=getenv("OPENAI_API_KEY"), ), synthesizer=ElevenLabsSynthesizer( voice_id=ADAM_VOICE_ID, - api_key=os.getenv("ELEVEN_LABS_API_KEY"), + api_key=getenv("ELEVEN_LABS_API_KEY"), ), logger=logger, ) diff --git a/vocode/__init__.py b/vocode/__init__.py index c5da2cc..b2c8af7 100644 --- a/vocode/__init__.py +++ b/vocode/__init__.py @@ -1,7 +1,17 @@ import os -from dotenv import load_dotenv -load_dotenv() -api_key = os.getenv("VOCODE_API_KEY") -base_url = os.getenv("VOCODE_BASE_URL", "api.vocode.dev") +environment = {} + + +def setenv(**kwargs): + for key, value in kwargs.items(): + environment[key] = value + + +def getenv(key, default=None): + return environment.get(key) or os.getenv(key, default) + + +api_key = getenv("VOCODE_API_KEY") +base_url = getenv("VOCODE_BASE_URL", "api.vocode.dev") diff --git a/vocode/streaming/agent/chat_gpt_agent.py b/vocode/streaming/agent/chat_gpt_agent.py index 0a240e8..623350b 100644 --- a/vocode/streaming/agent/chat_gpt_agent.py +++ b/vocode/streaming/agent/chat_gpt_agent.py @@ -1,4 +1,3 @@ -import os import random import time from langchain.prompts import ( @@ -16,23 +15,20 @@ import openai import json from typing 
import Generator, Optional -from dotenv import load_dotenv from typing import Generator import logging +from vocode import getenv from vocode.streaming.agent.base_agent import BaseAgent from vocode.streaming.models.agent import ChatGPTAgentConfig from vocode.streaming.utils.sse_client import SSEClient from vocode.streaming.agent.utils import stream_llm_response -load_dotenv() - -openai.api_key = os.environ.get("OPENAI_API_KEY") - class ChatGPTAgent(BaseAgent): def __init__(self, agent_config: ChatGPTAgentConfig, logger: logging.Logger = None): super().__init__(agent_config) + openai.api_key = getenv("OPENAI_API_KEY") self.agent_config = agent_config self.logger = logger or logging.getLogger(__name__) self.logger.setLevel(logging.DEBUG) @@ -112,7 +108,7 @@ class ChatGPTAgent(BaseAgent): "https://api.openai.com/v1/chat/completions", headers={ "Content-Type": "application/json", - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "Authorization": f"Bearer {getenv('OPENAI_API_KEY')}", }, json={ "model": self.agent_config.model_name, diff --git a/vocode/streaming/agent/llm_agent.py b/vocode/streaming/agent/llm_agent.py index e605507..9810239 100644 --- a/vocode/streaming/agent/llm_agent.py +++ b/vocode/streaming/agent/llm_agent.py @@ -1,7 +1,6 @@ import re from typing import Optional -from dotenv import load_dotenv from langchain import OpenAI from langchain.llms import OpenAIChat from typing import Generator @@ -11,8 +10,6 @@ from vocode.streaming.agent.base_agent import BaseAgent from vocode.streaming.agent.utils import stream_llm_response from vocode.streaming.models.agent import LLMAgentConfig -load_dotenv() - class LLMAgent(BaseAgent): SENTENCE_ENDINGS = [".", "!", "?"] diff --git a/vocode/streaming/factory.py b/vocode/streaming/factory.py index 063c46b..9838b7d 100644 --- a/vocode/streaming/factory.py +++ b/vocode/streaming/factory.py @@ -10,7 +10,6 @@ from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer from 
vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer -from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber from vocode.streaming.transcriber.base_transcriber import BaseTranscriber from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber @@ -48,11 +47,6 @@ def create_synthesizer(synthesizer_config: SynthesizerConfig) -> BaseSynthesizer elif synthesizer_config.type == SynthesizerType.AZURE: return AzureSynthesizer(synthesizer_config) elif synthesizer_config.type == SynthesizerType.ELEVEN_LABS: - kwargs = {} - if synthesizer_config.voice_id: - kwargs["voice_id"] = synthesizer_config.voice_id - return ElevenLabsSynthesizer(synthesizer_config, **kwargs) - elif synthesizer_config.type == SynthesizerType.RIME: - return RimeSynthesizer(synthesizer_config) + return ElevenLabsSynthesizer(synthesizer_config) else: raise Exception("Invalid synthesizer config") diff --git a/vocode/streaming/hosted_streaming_conversation.py b/vocode/streaming/hosted_streaming_conversation.py index 657e40d..ccdc78f 100644 --- a/vocode/streaming/hosted_streaming_conversation.py +++ b/vocode/streaming/hosted_streaming_conversation.py @@ -2,8 +2,6 @@ import websockets from websockets.exceptions import ConnectionClosedOK from websockets.client import WebSocketClientProtocol import asyncio -from dotenv import load_dotenv -import os import logging import threading import queue @@ -22,8 +20,6 @@ from vocode.streaming.models.websocket import ( StopMessage, ) -load_dotenv() - class HostedStreamingConversation: def __init__( diff --git a/vocode/streaming/streaming_conversation.py b/vocode/streaming/streaming_conversation.py index 89ad708..ce08a21 100644 --- 
a/vocode/streaming/streaming_conversation.py +++ b/vocode/streaming/streaming_conversation.py @@ -8,15 +8,18 @@ import time import secrets import random -from dotenv import load_dotenv from vocode.streaming.agent.bot_sentiment_analyser import ( BotSentiment, BotSentimentAnalyser, ) from vocode.streaming.agent.information_retrieval_agent import InformationRetrievalAgent +from vocode.streaming.factory import ( + create_agent, + create_synthesizer, + create_transcriber, +) from vocode.streaming.models.message import BaseMessage from vocode.streaming.output_device.base_output_device import BaseOutputDevice -from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber from vocode.streaming.utils.goodbye_model import GoodbyeModel from vocode.streaming.utils.transcript import Transcript @@ -48,9 +51,6 @@ from vocode.streaming.synthesizer.base_synthesizer import ( SynthesisResult, FillerAudio, ) -from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer -from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer -from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer from vocode.streaming.utils import ( create_conversation_id, create_loop_in_thread, @@ -60,19 +60,15 @@ from vocode.streaming.transcriber.base_transcriber import ( Transcription, BaseTranscriber, ) -from vocode.streaming.transcriber.google_transcriber import GoogleTranscriber -from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber - -load_dotenv() class StreamingConversation: def __init__( self, output_device: BaseOutputDevice, - transcriber: BaseTranscriber, - agent: BaseAgent, - synthesizer: BaseSynthesizer, + transcriber_config: TranscriberConfig, + agent_config: AgentConfig, + synthesizer_config: SynthesizerConfig, conversation_id: str = None, per_chunk_allowance_seconds: int = PER_CHUNK_ALLOWANCE_SECONDS, logger: 
Optional[logging.Logger] = None, @@ -80,11 +76,11 @@ class StreamingConversation: self.id = conversation_id or create_conversation_id() self.logger = logger or logging.getLogger(__name__) self.output_device = output_device - self.transcriber = transcriber + self.transcriber = create_transcriber(transcriber_config) self.transcriber.set_on_response(self.on_transcription_response) self.transcriber_task = None - self.agent = agent - self.synthesizer = synthesizer + self.agent = create_agent(agent_config) + self.synthesizer = create_synthesizer(synthesizer_config) self.synthesizer_event_loop = asyncio.new_event_loop() self.synthesizer_thread = threading.Thread( name="synthesizer", diff --git a/vocode/streaming/synthesizer/azure_synthesizer.py b/vocode/streaming/synthesizer/azure_synthesizer.py index a8f603f..5237479 100644 --- a/vocode/streaming/synthesizer/azure_synthesizer.py +++ b/vocode/streaming/synthesizer/azure_synthesizer.py @@ -4,7 +4,7 @@ import re from typing import Any, Optional from xml.etree import ElementTree import azure.cognitiveservices.speech as speechsdk -from dotenv import load_dotenv +from vocode import getenv from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment from vocode.streaming.models.message import BaseMessage, SSMLMessage @@ -20,7 +20,6 @@ from vocode.streaming.synthesizer.base_synthesizer import ( from vocode.streaming.models.synthesizer import AzureSynthesizerConfig from vocode.streaming.models.audio_encoding import AudioEncoding -load_dotenv() NAMESPACES = { "mstts": "https://www.w3.org/2001/mstts", @@ -59,8 +58,8 @@ class AzureSynthesizer(BaseSynthesizer): self.synthesizer_config = synthesizer_config # Instantiates a client speech_config = speechsdk.SpeechConfig( - subscription=os.environ.get("AZURE_SPEECH_KEY"), - region=os.environ.get("AZURE_SPEECH_REGION"), + subscription=getenv("AZURE_SPEECH_KEY"), + region=getenv("AZURE_SPEECH_REGION"), ) if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16: if 
self.synthesizer_config.sampling_rate == 44100: diff --git a/vocode/streaming/synthesizer/eleven_labs_synthesizer.py b/vocode/streaming/synthesizer/eleven_labs_synthesizer.py index 7663c58..610232a 100644 --- a/vocode/streaming/synthesizer/eleven_labs_synthesizer.py +++ b/vocode/streaming/synthesizer/eleven_labs_synthesizer.py @@ -1,7 +1,6 @@ from typing import Any, Optional -import os -from dotenv import load_dotenv import requests +from vocode import getenv from vocode.streaming.synthesizer.base_synthesizer import ( BaseSynthesizer, @@ -11,9 +10,7 @@ from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment from vocode.streaming.models.message import BaseMessage -load_dotenv() -ELEVEN_LABS_API_KEY = os.environ.get("ELEVEN_LABS_API_KEY") ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/" ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB" OBAMA_VOICE_ID = "vLITIS0SH2an5iQGxw5C" @@ -22,7 +19,7 @@ OBAMA_VOICE_ID = "vLITIS0SH2an5iQGxw5C" class ElevenLabsSynthesizer(BaseSynthesizer): def __init__(self, config: ElevenLabsSynthesizerConfig): super().__init__(config) - self.api_key = config.api_key + self.api_key = getenv("ELEVEN_LABS_API_KEY") self.voice_id = config.voice_id or ADAM_VOICE_ID self.words_per_minute = 150 diff --git a/vocode/streaming/synthesizer/google_synthesizer.py b/vocode/streaming/synthesizer/google_synthesizer.py index 6af1f41..cecde94 100644 --- a/vocode/streaming/synthesizer/google_synthesizer.py +++ b/vocode/streaming/synthesizer/google_synthesizer.py @@ -2,7 +2,6 @@ import io import wave from typing import Any, Optional -from dotenv import load_dotenv from google.cloud import texttospeech_v1beta1 as tts from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment @@ -16,8 +15,6 @@ from vocode.streaming.models.synthesizer import GoogleSynthesizerConfig from vocode.streaming.models.audio_encoding import AudioEncoding from vocode.streaming.utils import 
convert_wav -load_dotenv() - class GoogleSynthesizer(BaseSynthesizer): OFFSET_SECONDS = 0.5 diff --git a/vocode/streaming/synthesizer/rime_synthesizer.py b/vocode/streaming/synthesizer/rime_synthesizer.py deleted file mode 100644 index 74d2f0b..0000000 --- a/vocode/streaming/synthesizer/rime_synthesizer.py +++ /dev/null @@ -1,78 +0,0 @@ -import audioop -import base64 -from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment -from vocode.streaming.models.audio_encoding import AudioEncoding - -from vocode.streaming.models.message import BaseMessage - -from .base_synthesizer import BaseSynthesizer, SynthesisResult, encode_as_wav -from typing import Any, Optional -import os -import io -import wave -from dotenv import load_dotenv -import requests - -from ..utils import convert_linear_audio, convert_wav -from ..models.synthesizer import ElevenLabsSynthesizerConfig, RimeSynthesizerConfig - -load_dotenv() - -RIME_API_KEY = os.getenv("RIME_API_KEY") -RIME_BASE_URL = os.getenv("RIME_BASE_URL") - - -class RimeSynthesizer(BaseSynthesizer): - def __init__(self, config: RimeSynthesizerConfig): - super().__init__(config) - self.speaker = config.speaker - - def create_speech( - self, - message: BaseMessage, - chunk_size: int, - bot_sentiment: Optional[BotSentiment] = None, - ) -> SynthesisResult: - url = RIME_BASE_URL - headers = {"Authorization": f"Bearer {RIME_API_KEY}"} - body = {"inputs": {"text": message.text, "speaker": self.speaker}} - response = requests.post(url, headers=headers, json=body) - - def chunk_generator(audio, chunk_transform=lambda x: x): - for i in range(0, len(audio), chunk_size): - chunk = audio[i : i + chunk_size] - yield SynthesisResult.ChunkResult( - chunk_transform(chunk), len(chunk) != chunk_size - ) - - assert response.ok, response.text - data = response.json().get("data") - assert data - - audio_file = io.BytesIO(base64.b64decode(data)) - - if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16: - output_bytes = 
convert_wav( - audio_file, - output_sample_rate=self.synthesizer_config.sampling_rate, - output_encoding=AudioEncoding.LINEAR16, - ) - elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW: - output_bytes = convert_wav( - audio_file, - output_sample_rate=self.synthesizer_config.sampling_rate, - output_encoding=AudioEncoding.MULAW, - ) - - if self.synthesizer_config.should_encode_as_wav: - output_generator = chunk_generator( - output_bytes, chunk_transform=encode_as_wav - ) - else: - output_generator = chunk_generator(output_bytes) - return SynthesisResult( - output_generator, - lambda seconds: self.get_message_cutoff_from_total_response_length( - message, seconds, len(output_bytes) - ), - ) diff --git a/vocode/streaming/telephony/config_manager/base_config_manager.py b/vocode/streaming/telephony/config_manager/base_config_manager.py index e74025e..22f3702 100644 --- a/vocode/streaming/telephony/config_manager/base_config_manager.py +++ b/vocode/streaming/telephony/config_manager/base_config_manager.py @@ -1,5 +1,4 @@ import logging -import os from typing import Optional from redis import Redis diff --git a/vocode/streaming/telephony/conversation/call.py b/vocode/streaming/telephony/conversation/call.py index f2fae8f..8426e0f 100644 --- a/vocode/streaming/telephony/conversation/call.py +++ b/vocode/streaming/telephony/conversation/call.py @@ -4,6 +4,7 @@ from enum import Enum import json import logging from typing import Optional +from vocode import getenv from vocode.streaming.agent.base_agent import BaseAgent from vocode.streaming.factory import ( create_agent, @@ -42,38 +43,36 @@ class Call(StreamingConversation): self, base_url: str, config_manager: BaseConfigManager, - agent: BaseAgent, - twilio_config: TwilioConfig, - transcriber: Optional[BaseTranscriber] = None, - synthesizer: Optional[BaseSynthesizer] = None, - twilio_sid=None, + agent_config: BaseAgent, + transcriber_config: Optional[BaseTranscriber] = None, + synthesizer_config: 
Optional[BaseSynthesizer] = None, + twilio_config: Optional[TwilioConfig] = None, + twilio_sid: Optional[str] = None, conversation_id: Optional[str] = None, logger: Optional[logging.Logger] = None, ): self.base_url = base_url self.config_manager = config_manager self.output_device = TwilioOutputDevice() - self.twilio_config = twilio_config + self.twilio_config = twilio_config or TwilioConfig( + account_sid=getenv("TWILIO_ACCOUNT_SID"), + auth_token=getenv("TWILIO_AUTH_TOKEN"), + ) self.twilio_client = create_twilio_client(twilio_config) super().__init__( self.output_device, - transcriber - or DeepgramTranscriber( - DeepgramTranscriberConfig( - sampling_rate=8000, - audio_encoding=AudioEncoding.MULAW, - chunk_size=self.CHUNK_SIZE, - model="voicemail", - endpointing_config=PunctuationEndpointingConfig(), - ), - logger=logger, + transcriber_config + or DeepgramTranscriberConfig( + sampling_rate=8000, + audio_encoding=AudioEncoding.MULAW, + chunk_size=self.CHUNK_SIZE, + model="voicemail", + endpointing_config=PunctuationEndpointingConfig(), ), - agent, - synthesizer - or AzureSynthesizer( - AzureSynthesizerConfig( - sampling_rate=8000, audio_encoding=AudioEncoding.MULAW - ) + agent_config, + synthesizer_config + or AzureSynthesizerConfig( + sampling_rate=8000, audio_encoding=AudioEncoding.MULAW ), conversation_id=conversation_id, per_chunk_allowance_seconds=0.01, @@ -94,9 +93,9 @@ class Call(StreamingConversation): base_url=base_url, logger=logger, config_manager=config_manager, - agent=create_agent(call_config.agent_config), - transcriber=create_transcriber(call_config.transcriber_config), - synthesizer=create_synthesizer(call_config.synthesizer_config), + agent_config=call_config.agent_config, + transcriber_config=call_config.transcriber_config, + synthesizer_config=call_config.synthesizer_config, twilio_config=call_config.twilio_config, twilio_sid=call_config.twilio_sid, conversation_id=conversation_id, diff --git 
a/vocode/streaming/telephony/conversation/outbound_call.py b/vocode/streaming/telephony/conversation/outbound_call.py index 7d24ad3..557fa30 100644 --- a/vocode/streaming/telephony/conversation/outbound_call.py +++ b/vocode/streaming/telephony/conversation/outbound_call.py @@ -1,6 +1,6 @@ import logging from typing import Optional -from twilio.rest import Client +from vocode import getenv from vocode.streaming.models.agent import AgentConfig from vocode.streaming.models.synthesizer import ( @@ -33,7 +33,7 @@ class OutboundCall: from_phone: str, config_manager: BaseConfigManager, agent_config: AgentConfig, - twilio_config: TwilioConfig, + twilio_config: Optional[TwilioConfig] = None, transcriber_config: Optional[TranscriberConfig] = None, synthesizer_config: Optional[SynthesizerConfig] = None, conversation_id: Optional[str] = None, @@ -56,7 +56,10 @@ class OutboundCall: ) self.conversation_id = conversation_id or create_conversation_id() self.logger = logger - self.twilio_config = twilio_config + self.twilio_config = twilio_config or TwilioConfig( + account_sid=getenv("TWILIO_ACCOUNT_SID"), + auth_token=getenv("TWILIO_AUTH_TOKEN"), + ) self.twilio_client = create_twilio_client(twilio_config) self.twilio_sid = None diff --git a/vocode/streaming/telephony/conversation/zoom_dial_in.py b/vocode/streaming/telephony/conversation/zoom_dial_in.py index aa6766d..d57f1d6 100644 --- a/vocode/streaming/telephony/conversation/zoom_dial_in.py +++ b/vocode/streaming/telephony/conversation/zoom_dial_in.py @@ -24,10 +24,10 @@ class ZoomDialIn(OutboundCall): zoom_meeting_password: Optional[str], from_phone: str, config_manager: BaseConfigManager, - twilio_config: TwilioConfig, agent_config: AgentConfig, transcriber_config: TranscriberConfig, synthesizer_config: SynthesizerConfig, + twilio_config: Optional[TwilioConfig] = None, conversation_id: Optional[str] = None, logger: Optional[logging.Logger] = None, ): diff --git a/vocode/streaming/telephony/server/base.py 
b/vocode/streaming/telephony/server/base.py index ed10892..8be5688 100644 --- a/vocode/streaming/telephony/server/base.py +++ b/vocode/streaming/telephony/server/base.py @@ -34,7 +34,6 @@ from vocode.streaming.models.telephony import ( EndOutboundCall, TwilioConfig, ) -from twilio.rest import Client from vocode.streaming.telephony.conversation.call import Call from vocode.streaming.telephony.templates import Templater @@ -45,7 +44,7 @@ from vocode.streaming.utils import create_conversation_id class InboundCallConfig(BaseModel): url: str agent_config: AgentConfig - twilio_config: TwilioConfig + twilio_config: Optional[TwilioConfig] = None transcriber_config: Optional[TranscriberConfig] = None synthesizer_config: Optional[SynthesizerConfig] = None @@ -92,7 +91,7 @@ class TelephonyServer: def create_inbound_route( self, agent_config: AgentConfig, - twilio_config: TwilioConfig, + twilio_config: Optional[TwilioConfig] = None, transcriber_config: Optional[TranscriberConfig] = None, synthesizer_config: Optional[SynthesizerConfig] = None, ): diff --git a/vocode/streaming/telephony/twilio.py b/vocode/streaming/telephony/twilio.py index 37acbe0..9fe7765 100644 --- a/vocode/streaming/telephony/twilio.py +++ b/vocode/streaming/telephony/twilio.py @@ -1,12 +1,8 @@ -import os from typing import Optional -from dotenv import load_dotenv from twilio.rest import Client from vocode.streaming.models.telephony import TwilioConfig -load_dotenv() - def create_twilio_client(twilio_config: TwilioConfig): return Client(twilio_config.account_sid, twilio_config.auth_token) diff --git a/vocode/streaming/transcriber/assembly_ai_transcriber.py b/vocode/streaming/transcriber/assembly_ai_transcriber.py index 3389d9d..6780a8e 100644 --- a/vocode/streaming/transcriber/assembly_ai_transcriber.py +++ b/vocode/streaming/transcriber/assembly_ai_transcriber.py @@ -1,10 +1,9 @@ import asyncio import json import logging -import os -from dotenv import load_dotenv import websockets from urllib.parse import 
urlencode +from vocode import getenv from vocode.streaming.models.transcriber import AssemblyAITranscriberConfig from vocode.streaming.models.websocket import AudioMessage @@ -14,9 +13,7 @@ from vocode.streaming.transcriber.base_transcriber import ( ) from vocode.streaming.models.audio_encoding import AudioEncoding -load_dotenv() -ASSEMBLY_AI_API_KEY = os.environ.get("ASSEMBLY_AI_API_KEY") ASSEMBLY_AI_URL = "wss://api.assemblyai.com/v2/realtime/ws" @@ -27,6 +24,7 @@ class AssemblyAITranscriber(BaseTranscriber): logger: logging.Logger = None, ): super().__init__(transcriber_config) + self.api_key = getenv("ASSEMBLY_AI_API_KEY") self._ended = False self.is_ready = False self.logger = logger or logging.getLogger(__name__) @@ -61,7 +59,7 @@ class AssemblyAITranscriber(BaseTranscriber): async with websockets.connect( URL, - extra_headers=(("Authorization", ASSEMBLY_AI_API_KEY),), + extra_headers=(("Authorization", self.api_key),), ping_interval=5, ping_timeout=20, ) as ws: diff --git a/vocode/streaming/transcriber/base_transcriber.py b/vocode/streaming/transcriber/base_transcriber.py index 7c9aa0b..2e42713 100644 --- a/vocode/streaming/transcriber/base_transcriber.py +++ b/vocode/streaming/transcriber/base_transcriber.py @@ -1,11 +1,8 @@ -from dotenv import load_dotenv from typing import Callable, Optional, Awaitable from vocode.streaming.utils import convert_wav from vocode.streaming.models.transcriber import EndpointingConfig, TranscriberConfig -load_dotenv() - class Transcription: def __init__( diff --git a/vocode/streaming/transcriber/deepgram_transcriber.py b/vocode/streaming/transcriber/deepgram_transcriber.py index 2ff2387..c475b60 100644 --- a/vocode/streaming/transcriber/deepgram_transcriber.py +++ b/vocode/streaming/transcriber/deepgram_transcriber.py @@ -1,12 +1,11 @@ import asyncio import json import logging -import os -from dotenv import load_dotenv import websockets from websockets.client import WebSocketClientProtocol import audioop from urllib.parse 
import urlencode +from vocode import getenv from vocode.streaming.transcriber.base_transcriber import ( BaseTranscriber, @@ -19,9 +18,7 @@ from vocode.streaming.models.transcriber import ( ) from vocode.streaming.models.audio_encoding import AudioEncoding -load_dotenv() -DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY") PUNCTUATION_TERMINATORS = [".", "!", "?"] NUM_RESTARTS = 5 @@ -33,6 +30,7 @@ class DeepgramTranscriber(BaseTranscriber): logger: logging.Logger = None, ): super().__init__(transcriber_config) + self.api_key = getenv("DEEPGRAM_API_KEY") self.transcriber_config = transcriber_config self._ended = False self.warmed_up = False @@ -155,7 +153,7 @@ class DeepgramTranscriber(BaseTranscriber): return data["duration"] async def process(self, warmup=True): - extra_headers = {"Authorization": f"Token {DEEPGRAM_API_KEY}"} + extra_headers = {"Authorization": f"Token {self.api_key}"} self.audio_queue = asyncio.Queue() async with websockets.connect( diff --git a/vocode/streaming/utils/goodbye_model.py b/vocode/streaming/utils/goodbye_model.py index 6ad95cf..e15b051 100644 --- a/vocode/streaming/utils/goodbye_model.py +++ b/vocode/streaming/utils/goodbye_model.py @@ -1,19 +1,13 @@ import os import asyncio import openai -from dotenv import load_dotenv import numpy as np import requests -load_dotenv() -openai.api_key = os.getenv("OPENAI_API_KEY") +from vocode import getenv - -PLATFORM = "pyq" if os.getenv("USE_PYQ_EMBEDDINGS", "false") == "true" else "openai" SIMILARITY_THRESHOLD = 0.9 -SIMILARITY_THRESHOLD_PYQ = 0.7 EMBEDDING_SIZE = 1536 -PYQ_EMBEDDING_SIZE = 768 GOODBYE_PHRASES = [ "bye", "goodbye", @@ -24,7 +17,6 @@ GOODBYE_PHRASES = [ "have a good day", "have a good night", ] -PYQ_API_URL = "https://embeddings.pyqai.com" class GoodbyeModel: @@ -34,12 +26,10 @@ class GoodbyeModel: os.path.dirname(__file__), "goodbye_embeddings" ), ): + openai.api_key = getenv("OPENAI_API_KEY") self.goodbye_embeddings = self.load_or_create_embeddings(
f"{embeddings_cache_path}/goodbye_embeddings.npy" ) - self.goodbye_embeddings_pyq = self.load_or_create_embeddings( - f"{embeddings_cache_path}/goodbye_embeddings_pyq.npy" - ) def load_or_create_embeddings(self, path): if os.path.exists(path): @@ -49,50 +39,33 @@ class GoodbyeModel: np.save(path, embeddings) return embeddings - def create_embeddings(self, platform=PLATFORM): + def create_embeddings(self): print("Creating embeddings...") - size = EMBEDDING_SIZE if platform == "openai" else PYQ_EMBEDDING_SIZE + size = EMBEDDING_SIZE embeddings = np.empty((size, len(GOODBYE_PHRASES))) for i, goodbye_phrase in enumerate(GOODBYE_PHRASES): - embeddings[:, i] = self.create_embedding(goodbye_phrase, platform=platform) + embeddings[:, i] = self.create_embedding(goodbye_phrase) return embeddings - async def is_goodbye(self, text: str, platform=PLATFORM) -> bool: + async def is_goodbye(self, text: str) -> bool: if "bye" in text.lower(): return True - embedding = self.create_embedding(text.strip().lower(), platform=platform) - goodbye_embeddings = ( - self.goodbye_embeddings - if platform == "openai" - else self.goodbye_embeddings_pyq - ) - threshold = ( - SIMILARITY_THRESHOLD if platform == "openai" else SIMILARITY_THRESHOLD_PYQ - ) - similarity_results = embedding @ goodbye_embeddings - return np.max(similarity_results) > threshold + embedding = self.create_embedding(text.strip().lower()) + similarity_results = embedding @ self.goodbye_embeddings + return np.max(similarity_results) > SIMILARITY_THRESHOLD - def create_embedding(self, text, platform=PLATFORM) -> np.array: - if platform == "openai": - return np.array( - openai.Embedding.create(input=text, model="text-embedding-ada-002")[ - "data" - ][0]["embedding"] - ) - elif platform == "pyq": - return np.array( - requests.post( - PYQ_API_URL, - headers={ - "Content-Type": "application/json", - "Authorization": os.getenv("PYQ_API_KEY"), - }, - json={"input_sequence": [text], "account_id": "400"}, - ).json()["response"][0] - ) 
+ def create_embedding(self, text) -> np.array: + return np.array( + openai.Embedding.create(input=text, model="text-embedding-ada-002")["data"][ + 0 + ]["embedding"] + ) if __name__ == "__main__": + from dotenv import load_dotenv + + load_dotenv() async def main(): model = GoodbyeModel() diff --git a/vocode/turn_based/agent/chat_gpt_agent.py b/vocode/turn_based/agent/chat_gpt_agent.py index 693a4fa..187311d 100644 --- a/vocode/turn_based/agent/chat_gpt_agent.py +++ b/vocode/turn_based/agent/chat_gpt_agent.py @@ -1,4 +1,3 @@ -import os from typing import Optional import openai from langchain.prompts import ( @@ -10,6 +9,7 @@ from langchain.prompts import ( from langchain.chains import ConversationChain from langchain.chat_models import ChatOpenAI from langchain.memory import ConversationBufferMemory +from vocode import getenv from vocode.turn_based.agent.base_agent import BaseAgent @@ -25,7 +25,7 @@ class ChatGPTAgent(BaseAgent): max_tokens: int = 100, ): super().__init__(initial_message=initial_message) - openai.api_key = os.getenv("OPENAI_API_KET", api_key) + openai.api_key = getenv("OPENAI_API_KEY", api_key) if not openai.api_key: raise ValueError("OpenAI API key not provided") self.prompt = ChatPromptTemplate.from_messages( diff --git a/vocode/turn_based/synthesizer/azure_synthesizer.py b/vocode/turn_based/synthesizer/azure_synthesizer.py index c88a663..f893913 100644 --- a/vocode/turn_based/synthesizer/azure_synthesizer.py +++ b/vocode/turn_based/synthesizer/azure_synthesizer.py @@ -1,7 +1,7 @@ -import os from typing import Optional import azure.cognitiveservices.speech as speechsdk from pydub import AudioSegment +from vocode import getenv from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer @@ -15,8 +15,8 @@ class AzureSynthesizer(BaseSynthesizer): ): self.sampling_rate = sampling_rate speech_config = speechsdk.SpeechConfig( - subscription=os.getenv("AZURE_SPEECH_KEY", api_key), - region=os.getenv("AZURE_SPEECH_REGION", region), +
subscription=getenv("AZURE_SPEECH_KEY", api_key), + region=getenv("AZURE_SPEECH_REGION", region), ) if self.sampling_rate == 44100: speech_config.set_speech_synthesis_output_format( diff --git a/vocode/turn_based/synthesizer/eleven_labs_synthesizer.py b/vocode/turn_based/synthesizer/eleven_labs_synthesizer.py index 018b895..d819d28 100644 --- a/vocode/turn_based/synthesizer/eleven_labs_synthesizer.py +++ b/vocode/turn_based/synthesizer/eleven_labs_synthesizer.py @@ -1,8 +1,8 @@ import io -import os from typing import Optional from pydub import AudioSegment import requests +from vocode import getenv from vocode.turn_based.synthesizer.base_synthesizer import BaseSynthesizer ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/" @@ -11,7 +11,7 @@ ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/" class ElevenLabsSynthesizer(BaseSynthesizer): def __init__(self, voice_id: str, api_key: Optional[str] = None): self.voice_id = voice_id - self.api_key = os.getenv("ELEVEN_LABS_API_KEY", api_key) + self.api_key = getenv("ELEVEN_LABS_API_KEY", api_key) def synthesize(self, text: str) -> AudioSegment: url = ELEVEN_LABS_BASE_URL + f"text-to-speech/{self.voice_id}" diff --git a/vocode/turn_based/transcriber/whisper_transcriber.py b/vocode/turn_based/transcriber/whisper_transcriber.py index 24c59d0..7c2d1cb 100644 --- a/vocode/turn_based/transcriber/whisper_transcriber.py +++ b/vocode/turn_based/transcriber/whisper_transcriber.py @@ -1,15 +1,15 @@ from typing import Optional from pydub import AudioSegment import io -import os import openai +from vocode import getenv from vocode.turn_based.transcriber.base_transcriber import BaseTranscriber class WhisperTranscriber(BaseTranscriber): def __init__(self, api_key: Optional[str] = None): - openai.api_key = os.getenv("OPENAI_API_KEY", api_key) + openai.api_key = getenv("OPENAI_API_KEY", api_key) if not openai.api_key: raise ValueError("OpenAI API key not provided")