remove pyq goodbye model and rime synthesizer and fix environment loading

This commit is contained in:
Ajay Raj 2023-03-28 10:20:36 -07:00
commit 1dc7bc74c3
28 changed files with 143 additions and 285 deletions

View file

@ -1,4 +1,3 @@
import os
import random
import time
from langchain.prompts import (
@ -16,23 +15,20 @@ import openai
import json
from typing import Generator, Optional
from dotenv import load_dotenv
from typing import Generator
import logging
from vocode import getenv
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.models.agent import ChatGPTAgentConfig
from vocode.streaming.utils.sse_client import SSEClient
from vocode.streaming.agent.utils import stream_llm_response
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")
class ChatGPTAgent(BaseAgent):
def __init__(self, agent_config: ChatGPTAgentConfig, logger: logging.Logger = None):
super().__init__(agent_config)
openai.api_key = getenv("OPENAI_API_KEY")
self.agent_config = agent_config
self.logger = logger or logging.getLogger(__name__)
self.logger.setLevel(logging.DEBUG)
@ -112,7 +108,7 @@ class ChatGPTAgent(BaseAgent):
"https://api.openai.com/v1/chat/completions",
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
"Authorization": f"Bearer {getenv('OPENAI_API_KEY')}",
},
json={
"model": self.agent_config.model_name,

View file

@ -1,7 +1,6 @@
import re
from typing import Optional
from dotenv import load_dotenv
from langchain import OpenAI
from langchain.llms import OpenAIChat
from typing import Generator
@ -11,8 +10,6 @@ from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.agent.utils import stream_llm_response
from vocode.streaming.models.agent import LLMAgentConfig
load_dotenv()
class LLMAgent(BaseAgent):
SENTENCE_ENDINGS = [".", "!", "?"]

View file

@ -10,7 +10,6 @@ from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer
from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer
from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber
from vocode.streaming.transcriber.base_transcriber import BaseTranscriber
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
@ -48,11 +47,6 @@ def create_synthesizer(synthesizer_config: SynthesizerConfig) -> BaseSynthesizer
elif synthesizer_config.type == SynthesizerType.AZURE:
return AzureSynthesizer(synthesizer_config)
elif synthesizer_config.type == SynthesizerType.ELEVEN_LABS:
kwargs = {}
if synthesizer_config.voice_id:
kwargs["voice_id"] = synthesizer_config.voice_id
return ElevenLabsSynthesizer(synthesizer_config, **kwargs)
elif synthesizer_config.type == SynthesizerType.RIME:
return RimeSynthesizer(synthesizer_config)
return ElevenLabsSynthesizer(synthesizer_config)
else:
raise Exception("Invalid synthesizer config")

View file

@ -2,8 +2,6 @@ import websockets
from websockets.exceptions import ConnectionClosedOK
from websockets.client import WebSocketClientProtocol
import asyncio
from dotenv import load_dotenv
import os
import logging
import threading
import queue
@ -22,8 +20,6 @@ from vocode.streaming.models.websocket import (
StopMessage,
)
load_dotenv()
class HostedStreamingConversation:
def __init__(

View file

@ -8,15 +8,18 @@ import time
import secrets
import random
from dotenv import load_dotenv
from vocode.streaming.agent.bot_sentiment_analyser import (
BotSentiment,
BotSentimentAnalyser,
)
from vocode.streaming.agent.information_retrieval_agent import InformationRetrievalAgent
from vocode.streaming.factory import (
create_agent,
create_synthesizer,
create_transcriber,
)
from vocode.streaming.models.message import BaseMessage
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.synthesizer.rime_synthesizer import RimeSynthesizer
from vocode.streaming.transcriber.assembly_ai_transcriber import AssemblyAITranscriber
from vocode.streaming.utils.goodbye_model import GoodbyeModel
from vocode.streaming.utils.transcript import Transcript
@ -48,9 +51,6 @@ from vocode.streaming.synthesizer.base_synthesizer import (
SynthesisResult,
FillerAudio,
)
from vocode.streaming.synthesizer.google_synthesizer import GoogleSynthesizer
from vocode.streaming.synthesizer.azure_synthesizer import AzureSynthesizer
from vocode.streaming.synthesizer.eleven_labs_synthesizer import ElevenLabsSynthesizer
from vocode.streaming.utils import (
create_conversation_id,
create_loop_in_thread,
@ -60,19 +60,15 @@ from vocode.streaming.transcriber.base_transcriber import (
Transcription,
BaseTranscriber,
)
from vocode.streaming.transcriber.google_transcriber import GoogleTranscriber
from vocode.streaming.transcriber.deepgram_transcriber import DeepgramTranscriber
load_dotenv()
class StreamingConversation:
def __init__(
self,
output_device: BaseOutputDevice,
transcriber: BaseTranscriber,
agent: BaseAgent,
synthesizer: BaseSynthesizer,
transcriber_config: TranscriberConfig,
agent_config: AgentConfig,
synthesizer_config: SynthesizerConfig,
conversation_id: str = None,
per_chunk_allowance_seconds: int = PER_CHUNK_ALLOWANCE_SECONDS,
logger: Optional[logging.Logger] = None,
@ -80,11 +76,11 @@ class StreamingConversation:
self.id = conversation_id or create_conversation_id()
self.logger = logger or logging.getLogger(__name__)
self.output_device = output_device
self.transcriber = transcriber
self.transcriber = create_transcriber(transcriber_config)
self.transcriber.set_on_response(self.on_transcription_response)
self.transcriber_task = None
self.agent = agent
self.synthesizer = synthesizer
self.agent = create_agent(agent_config)
self.synthesizer = create_synthesizer(synthesizer_config)
self.synthesizer_event_loop = asyncio.new_event_loop()
self.synthesizer_thread = threading.Thread(
name="synthesizer",

View file

@ -4,7 +4,7 @@ import re
from typing import Any, Optional
from xml.etree import ElementTree
import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv
from vocode import getenv
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.message import BaseMessage, SSMLMessage
@ -20,7 +20,6 @@ from vocode.streaming.synthesizer.base_synthesizer import (
from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
from vocode.streaming.models.audio_encoding import AudioEncoding
load_dotenv()
NAMESPACES = {
"mstts": "https://www.w3.org/2001/mstts",
@ -59,8 +58,8 @@ class AzureSynthesizer(BaseSynthesizer):
self.synthesizer_config = synthesizer_config
# Instantiates a client
speech_config = speechsdk.SpeechConfig(
subscription=os.environ.get("AZURE_SPEECH_KEY"),
region=os.environ.get("AZURE_SPEECH_REGION"),
subscription=getenv("AZURE_SPEECH_KEY"),
region=getenv("AZURE_SPEECH_REGION"),
)
if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
if self.synthesizer_config.sampling_rate == 44100:

View file

@ -1,7 +1,6 @@
from typing import Any, Optional
import os
from dotenv import load_dotenv
import requests
from vocode import getenv
from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer,
@ -11,9 +10,7 @@ from vocode.streaming.models.synthesizer import ElevenLabsSynthesizerConfig
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.message import BaseMessage
load_dotenv()
ELEVEN_LABS_API_KEY = os.environ.get("ELEVEN_LABS_API_KEY")
ELEVEN_LABS_BASE_URL = "https://api.elevenlabs.io/v1/"
ADAM_VOICE_ID = "pNInz6obpgDQGcFmaJgB"
OBAMA_VOICE_ID = "vLITIS0SH2an5iQGxw5C"
@ -22,7 +19,7 @@ OBAMA_VOICE_ID = "vLITIS0SH2an5iQGxw5C"
class ElevenLabsSynthesizer(BaseSynthesizer):
def __init__(self, config: ElevenLabsSynthesizerConfig):
super().__init__(config)
self.api_key = config.api_key
self.api_key = getenv("ELEVEN_LABS_API_KEY")
self.voice_id = config.voice_id or ADAM_VOICE_ID
self.words_per_minute = 150

View file

@ -2,7 +2,6 @@ import io
import wave
from typing import Any, Optional
from dotenv import load_dotenv
from google.cloud import texttospeech_v1beta1 as tts
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
@ -16,8 +15,6 @@ from vocode.streaming.models.synthesizer import GoogleSynthesizerConfig
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.utils import convert_wav
load_dotenv()
class GoogleSynthesizer(BaseSynthesizer):
OFFSET_SECONDS = 0.5

View file

@ -1,78 +0,0 @@
import audioop
import base64
from vocode.streaming.agent.bot_sentiment_analyser import BotSentiment
from vocode.streaming.models.audio_encoding import AudioEncoding
from vocode.streaming.models.message import BaseMessage
from .base_synthesizer import BaseSynthesizer, SynthesisResult, encode_as_wav
from typing import Any, Optional
import os
import io
import wave
from dotenv import load_dotenv
import requests
from ..utils import convert_linear_audio, convert_wav
from ..models.synthesizer import ElevenLabsSynthesizerConfig, RimeSynthesizerConfig
load_dotenv()
RIME_API_KEY = os.getenv("RIME_API_KEY")
RIME_BASE_URL = os.getenv("RIME_BASE_URL")
class RimeSynthesizer(BaseSynthesizer):
    """Synthesizer backed by the Rime TTS HTTP API.

    Sends the message text to RIME_BASE_URL and converts the returned
    base64-encoded WAV into the encoding/sample-rate requested by the
    synthesizer config, yielding fixed-size chunks.
    """

    def __init__(self, config: RimeSynthesizerConfig):
        super().__init__(config)
        # Rime voice identifier taken from the config.
        self.speaker = config.speaker

    def create_speech(
        self,
        message: BaseMessage,
        chunk_size: int,
        bot_sentiment: Optional[BotSentiment] = None,
    ) -> SynthesisResult:
        """Synthesize ``message.text`` and return a chunked SynthesisResult.

        ``bot_sentiment`` is accepted for interface parity but unused here.
        Raises AssertionError on a non-2xx response or a missing ``data``
        field in the JSON body.
        """
        url = RIME_BASE_URL
        headers = {"Authorization": f"Bearer {RIME_API_KEY}"}
        body = {"inputs": {"text": message.text, "speaker": self.speaker}}
        # Blocking HTTP call to the Rime API.
        response = requests.post(url, headers=headers, json=body)

        def chunk_generator(audio, chunk_transform=lambda x: x):
            # Slice the audio into chunk_size pieces; the second ChunkResult
            # field flags the final (short) chunk via the length mismatch.
            for i in range(0, len(audio), chunk_size):
                chunk = audio[i : i + chunk_size]
                yield SynthesisResult.ChunkResult(
                    chunk_transform(chunk), len(chunk) != chunk_size
                )

        assert response.ok, response.text
        # "data" is assumed to be a base64-encoded WAV payload — confirm
        # against the Rime API response schema.
        data = response.json().get("data")
        assert data
        audio_file = io.BytesIO(base64.b64decode(data))
        if self.synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
            output_bytes = convert_wav(
                audio_file,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.LINEAR16,
            )
        elif self.synthesizer_config.audio_encoding == AudioEncoding.MULAW:
            output_bytes = convert_wav(
                audio_file,
                output_sample_rate=self.synthesizer_config.sampling_rate,
                output_encoding=AudioEncoding.MULAW,
            )
        # NOTE(review): if audio_encoding is neither LINEAR16 nor MULAW,
        # output_bytes is never assigned and the code below raises
        # UnboundLocalError — verify the config enum cannot take other values.
        if self.synthesizer_config.should_encode_as_wav:
            # Wrap each raw chunk in a WAV header before yielding.
            output_generator = chunk_generator(
                output_bytes, chunk_transform=encode_as_wav
            )
        else:
            output_generator = chunk_generator(output_bytes)
        return SynthesisResult(
            output_generator,
            # Estimate where to cut the message text given seconds elapsed,
            # proportional to total audio length.
            lambda seconds: self.get_message_cutoff_from_total_response_length(
                message, seconds, len(output_bytes)
            ),
        )

View file

@ -1,5 +1,4 @@
import logging
import os
from typing import Optional
from redis import Redis

View file

@ -4,6 +4,7 @@ from enum import Enum
import json
import logging
from typing import Optional
from vocode import getenv
from vocode.streaming.agent.base_agent import BaseAgent
from vocode.streaming.factory import (
create_agent,
@ -42,38 +43,36 @@ class Call(StreamingConversation):
self,
base_url: str,
config_manager: BaseConfigManager,
agent: BaseAgent,
twilio_config: TwilioConfig,
transcriber: Optional[BaseTranscriber] = None,
synthesizer: Optional[BaseSynthesizer] = None,
twilio_sid=None,
agent_config: BaseAgent,
transcriber_config: Optional[BaseTranscriber] = None,
synthesizer_config: Optional[BaseSynthesizer] = None,
twilio_config: Optional[TwilioConfig] = None,
twilio_sid: Optional[str] = None,
conversation_id: Optional[str] = None,
logger: Optional[logging.Logger] = None,
):
self.base_url = base_url
self.config_manager = config_manager
self.output_device = TwilioOutputDevice()
self.twilio_config = twilio_config
self.twilio_config = twilio_config or TwilioConfig(
account_sid=getenv("TWILIO_ACCOUNT_SID"),
auth_token=getenv("TWILIO_AUTH_TOKEN"),
)
self.twilio_client = create_twilio_client(twilio_config)
super().__init__(
self.output_device,
transcriber
or DeepgramTranscriber(
DeepgramTranscriberConfig(
sampling_rate=8000,
audio_encoding=AudioEncoding.MULAW,
chunk_size=self.CHUNK_SIZE,
model="voicemail",
endpointing_config=PunctuationEndpointingConfig(),
),
logger=logger,
transcriber_config
or DeepgramTranscriberConfig(
sampling_rate=8000,
audio_encoding=AudioEncoding.MULAW,
chunk_size=self.CHUNK_SIZE,
model="voicemail",
endpointing_config=PunctuationEndpointingConfig(),
),
agent,
synthesizer
or AzureSynthesizer(
AzureSynthesizerConfig(
sampling_rate=8000, audio_encoding=AudioEncoding.MULAW
)
agent_config,
synthesizer_config
or AzureSynthesizerConfig(
sampling_rate=8000, audio_encoding=AudioEncoding.MULAW
),
conversation_id=conversation_id,
per_chunk_allowance_seconds=0.01,
@ -94,9 +93,9 @@ class Call(StreamingConversation):
base_url=base_url,
logger=logger,
config_manager=config_manager,
agent=create_agent(call_config.agent_config),
transcriber=create_transcriber(call_config.transcriber_config),
synthesizer=create_synthesizer(call_config.synthesizer_config),
agent_config=call_config.agent_config,
transcriber_config=call_config.transcriber_config,
synthesizer_config=call_config.synthesizer_config,
twilio_config=call_config.twilio_config,
twilio_sid=call_config.twilio_sid,
conversation_id=conversation_id,

View file

@ -1,6 +1,6 @@
import logging
from typing import Optional
from twilio.rest import Client
from vocode import getenv
from vocode.streaming.models.agent import AgentConfig
from vocode.streaming.models.synthesizer import (
@ -33,7 +33,7 @@ class OutboundCall:
from_phone: str,
config_manager: BaseConfigManager,
agent_config: AgentConfig,
twilio_config: TwilioConfig,
twilio_config: Optional[TwilioConfig] = None,
transcriber_config: Optional[TranscriberConfig] = None,
synthesizer_config: Optional[SynthesizerConfig] = None,
conversation_id: Optional[str] = None,
@ -56,7 +56,10 @@ class OutboundCall:
)
self.conversation_id = conversation_id or create_conversation_id()
self.logger = logger
self.twilio_config = twilio_config
self.twilio_config = twilio_config or TwilioConfig(
account_sid=getenv("TWILIO_ACCOUNT_SID"),
auth_token=getenv("TWILIO_AUTH_TOKEN"),
)
self.twilio_client = create_twilio_client(twilio_config)
self.twilio_sid = None

View file

@ -24,10 +24,10 @@ class ZoomDialIn(OutboundCall):
zoom_meeting_password: Optional[str],
from_phone: str,
config_manager: BaseConfigManager,
twilio_config: TwilioConfig,
agent_config: AgentConfig,
transcriber_config: TranscriberConfig,
synthesizer_config: SynthesizerConfig,
twilio_config: Optional[TwilioConfig] = None,
conversation_id: Optional[str] = None,
logger: Optional[logging.Logger] = None,
):

View file

@ -34,7 +34,6 @@ from vocode.streaming.models.telephony import (
EndOutboundCall,
TwilioConfig,
)
from twilio.rest import Client
from vocode.streaming.telephony.conversation.call import Call
from vocode.streaming.telephony.templates import Templater
@ -45,7 +44,7 @@ from vocode.streaming.utils import create_conversation_id
class InboundCallConfig(BaseModel):
url: str
agent_config: AgentConfig
twilio_config: TwilioConfig
twilio_config: Optional[TwilioConfig] = None
transcriber_config: Optional[TranscriberConfig] = None
synthesizer_config: Optional[SynthesizerConfig] = None
@ -92,7 +91,7 @@ class TelephonyServer:
def create_inbound_route(
self,
agent_config: AgentConfig,
twilio_config: TwilioConfig,
twilio_config: Optional[TwilioConfig] = None,
transcriber_config: Optional[TranscriberConfig] = None,
synthesizer_config: Optional[SynthesizerConfig] = None,
):

View file

@ -1,12 +1,8 @@
import os
from typing import Optional
from dotenv import load_dotenv
from twilio.rest import Client
from vocode.streaming.models.telephony import TwilioConfig
load_dotenv()
def create_twilio_client(twilio_config: TwilioConfig) -> Client:
    """Build an authenticated Twilio REST ``Client`` from ``twilio_config``.

    Uses the account SID and auth token carried on the config object;
    performs no network I/O itself.
    """
    return Client(twilio_config.account_sid, twilio_config.auth_token)

View file

@ -1,10 +1,9 @@
import asyncio
import json
import logging
import os
from dotenv import load_dotenv
import websockets
from urllib.parse import urlencode
from vocode import getenv
from vocode.streaming.models.transcriber import AssemblyAITranscriberConfig
from vocode.streaming.models.websocket import AudioMessage
@ -14,9 +13,7 @@ from vocode.streaming.transcriber.base_transcriber import (
)
from vocode.streaming.models.audio_encoding import AudioEncoding
load_dotenv()
ASSEMBLY_AI_API_KEY = os.environ.get("ASSEMBLY_AI_API_KEY")
ASSEMBLY_AI_URL = "wss://api.assemblyai.com/v2/realtime/ws"
@ -27,6 +24,7 @@ class AssemblyAITranscriber(BaseTranscriber):
logger: logging.Logger = None,
):
super().__init__(transcriber_config)
self.api_key = getenv("ASSEMBLY_AI_API_KEY")
self._ended = False
self.is_ready = False
self.logger = logger or logging.getLogger(__name__)
@ -61,7 +59,7 @@ class AssemblyAITranscriber(BaseTranscriber):
async with websockets.connect(
URL,
extra_headers=(("Authorization", ASSEMBLY_AI_API_KEY),),
extra_headers=(("Authorization", self.api_key),),
ping_interval=5,
ping_timeout=20,
) as ws:

View file

@ -1,11 +1,8 @@
from dotenv import load_dotenv
from typing import Callable, Optional, Awaitable
from vocode.streaming.utils import convert_wav
from vocode.streaming.models.transcriber import EndpointingConfig, TranscriberConfig
load_dotenv()
class Transcription:
def __init__(

View file

@ -1,12 +1,11 @@
import asyncio
import json
import logging
import os
from dotenv import load_dotenv
import websockets
from websockets.client import WebSocketClientProtocol
import audioop
from urllib.parse import urlencode
from vocode import getenv
from vocode.streaming.transcriber.base_transcriber import (
BaseTranscriber,
@ -19,9 +18,7 @@ from vocode.streaming.models.transcriber import (
)
from vocode.streaming.models.audio_encoding import AudioEncoding
load_dotenv()
DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY")
PUNCTUATION_TERMINATORS = [".", "!", "?"]
NUM_RESTARTS = 5
@ -33,6 +30,7 @@ class DeepgramTranscriber(BaseTranscriber):
logger: logging.Logger = None,
):
super().__init__(transcriber_config)
self.api_key = getenv("DEEPGRAM_API_KEY")
self.transcriber_config = transcriber_config
self._ended = False
self.warmed_up = False
@ -155,7 +153,7 @@ class DeepgramTranscriber(BaseTranscriber):
return data["duration"]
async def process(self, warmup=True):
extra_headers = {"Authorization": f"Token {DEEPGRAM_API_KEY}"}
extra_headers = {"Authorization": f"Token {self.api_key}"}
self.audio_queue = asyncio.Queue()
async with websockets.connect(

View file

@ -1,19 +1,12 @@
import os
import asyncio
import openai
from dotenv import load_dotenv
import numpy as np
import requests
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
from vocode import getenv
PLATFORM = "pyq" if os.getenv("USE_PYQ_EMBEDDINGS", "false") == "true" else "openai"
SIMILARITY_THRESHOLD = 0.9
SIMILARITY_THRESHOLD_PYQ = 0.7
EMBEDDING_SIZE = 1536
PYQ_EMBEDDING_SIZE = 768
GOODBYE_PHRASES = [
"bye",
"goodbye",
@ -24,7 +17,6 @@ GOODBYE_PHRASES = [
"have a good day",
"have a good night",
]
PYQ_API_URL = "https://embeddings.pyqai.com"
class GoodbyeModel:
@ -34,12 +26,10 @@ class GoodbyeModel:
os.path.dirname(__file__), "goodbye_embeddings"
),
):
openai.api_key = getenv("OPENAI_API_KEY")
self.goodbye_embeddings = self.load_or_create_embeddings(
f"{embeddings_cache_path}/goodbye_embeddings.npy"
)
self.goodbye_embeddings_pyq = self.load_or_create_embeddings(
f"{embeddings_cache_path}/goodbye_embeddings_pyq.npy"
)
def load_or_create_embeddings(self, path):
if os.path.exists(path):
@ -49,50 +39,33 @@ class GoodbyeModel:
np.save(path, embeddings)
return embeddings
def create_embeddings(self, platform=PLATFORM):
def create_embeddings(self):
print("Creating embeddings...")
size = EMBEDDING_SIZE if platform == "openai" else PYQ_EMBEDDING_SIZE
size = EMBEDDING_SIZE
embeddings = np.empty((size, len(GOODBYE_PHRASES)))
for i, goodbye_phrase in enumerate(GOODBYE_PHRASES):
embeddings[:, i] = self.create_embedding(goodbye_phrase, platform=platform)
embeddings[:, i] = self.create_embedding(goodbye_phrase)
return embeddings
async def is_goodbye(self, text: str, platform=PLATFORM) -> bool:
async def is_goodbye(self, text: str) -> bool:
if "bye" in text.lower():
return True
embedding = self.create_embedding(text.strip().lower(), platform=platform)
goodbye_embeddings = (
self.goodbye_embeddings
if platform == "openai"
else self.goodbye_embeddings_pyq
)
threshold = (
SIMILARITY_THRESHOLD if platform == "openai" else SIMILARITY_THRESHOLD_PYQ
)
similarity_results = embedding @ goodbye_embeddings
return np.max(similarity_results) > threshold
embedding = self.create_embedding(text.strip().lower())
similarity_results = embedding @ self.goodbye_embeddings
return np.max(similarity_results) > SIMILARITY_THRESHOLD
def create_embedding(self, text, platform=PLATFORM) -> np.array:
if platform == "openai":
return np.array(
openai.Embedding.create(input=text, model="text-embedding-ada-002")[
"data"
][0]["embedding"]
)
elif platform == "pyq":
return np.array(
requests.post(
PYQ_API_URL,
headers={
"Content-Type": "application/json",
"Authorization": os.getenv("PYQ_API_KEY"),
},
json={"input_sequence": [text], "account_id": "400"},
).json()["response"][0]
)
def create_embedding(self, text) -> np.array:
return np.array(
openai.Embedding.create(input=text, model="text-embedding-ada-002")["data"][
0
]["embedding"]
)
if __name__ == "__main__":
from dotenv import load_dotenv
load_dotenv()
async def main():
model = GoodbyeModel()