fix: enhancements for the AssemblyAI component (#3934)

Enhancements for AssemblyAI component

Co-authored-by: Patrick Loeber <98830383+ploeber@users.noreply.github.com>
Patrick Loeber 2024-10-01 16:28:18 +02:00 committed by GitHub
commit d19c16462b
11 changed files with 137 additions and 116 deletions

File diff suppressed because one or more lines are too long

Binary image file changed (preview not shown): 392 KiB before, 233 KiB after.

View file

@@ -63,22 +63,12 @@ This component allows you to poll the transcripts. It checks the status of the
- **Input**:
  - AssemblyAI API Key: Your API key.
  - Polling Interval: The polling interval in seconds. Default is 3.
  - Polling Interval (Optional): The polling interval in seconds. Default is 3.
- **Output**:
  - Transcription Result: The AssemblyAI JSON response of a completed transcript. Contains the text and other info.
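For reference, a minimal sketch of the polling this component performs, assuming the `assemblyai` Python SDK; the API key, transcript ID, and interval are placeholders:

```python
import time

import assemblyai as aai

aai.settings.api_key = "YOUR_API_KEY"  # the AssemblyAI API Key input


def poll_transcript(transcript_id: str, polling_interval: float = 3.0) -> aai.Transcript:
    """Fetch the transcript by ID until it is completed or errored."""
    while True:
        transcript = aai.Transcript.get_by_id(transcript_id)
        if transcript.status in (aai.TranscriptStatus.completed, aai.TranscriptStatus.error):
            return transcript
        time.sleep(polling_interval)
```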
### AssemblyAI Parse Transcript
This component allows you to parse a *Transcription Result* and output the formatted text.
- **Input**:
  - Transcription Result: The output of the *Poll Transcript* component.
- **Output**:
  - Parsed transcription: The parsed transcript. If Speaker Labels was enabled in the *Start Transcript* component, it formats utterances with speakers and timestamps.
### AssemblyAI Get Subtitles
This component allows you to generate subtitles in SRT or VTT format.
@@ -90,7 +80,7 @@ This component allows you to generate subtitles in SRT or VTT format.
  - Character per Caption (Optional): The maximum number of characters per caption (0 for no limit).
- **Output**:
  - Parsed transcription: The parsed transcript. If Speaker Labels was enabled in the *Start Transcript* component, it formats utterances with speakers and timestamps.
  - Subtitles: A JSON response with the `subtitles` field containing the captions in SRT or VTT format.
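A rough equivalent with the `assemblyai` SDK, assuming a completed transcript ID; the 32-character cap is an arbitrary example value:

```python
import assemblyai as aai

aai.settings.api_key = "YOUR_API_KEY"

transcript = aai.Transcript.get_by_id("YOUR_TRANSCRIPT_ID")
srt = transcript.export_subtitles_srt(chars_per_caption=32)  # SRT format
vtt = transcript.export_subtitles_vtt(chars_per_caption=32)  # VTT format
```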
### AssemblyAI LeMUR
@@ -106,6 +96,9 @@ LeMUR automatically ingests the transcript as additional context, making it easy
  - Final Model: The model that is used for the final prompt after compression is performed. Default is Claude 3.5 Sonnet.
  - Temperature (Optional): The temperature to use for the model. Default is 0.0.
  - Max Output Size (Optional): Max output size in tokens, up to 4000. Default is 2000.
  - Endpoint (Optional): The LeMUR endpoint to use. Default is "task". For "summary" and "question-answer", no prompt input is needed. See [LeMUR API docs](https://www.assemblyai.com/docs/api-reference/lemur/) for more info.
  - Questions (Optional): Comma-separated list of your questions. Only used if *Endpoint* is "question-answer".
  - Transcript IDs (Optional): Comma-separated list of transcript IDs. LeMUR can perform actions over multiple transcripts. If provided, the *Transcription Result* is ignored.
- **Output**:
  - LeMUR Response: The generated LLM response.
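For orientation, a sketch of the three endpoints as SDK calls, mirroring the dispatch the component performs in the code below; the prompt and question are placeholders:

```python
import assemblyai as aai

aai.settings.api_key = "YOUR_API_KEY"
transcript = aai.Transcript.get_by_id("YOUR_TRANSCRIPT_ID")

# "task": free-form prompt
result = transcript.lemur.task(
    prompt="Summarize the key decisions in this meeting.",
    final_model=aai.LemurModel.claude3_5_sonnet,
    temperature=0.0,
    max_output_size=2000,
)

# "summary": no prompt needed
summary = transcript.lemur.summarize(final_model=aai.LemurModel.claude3_5_sonnet)

# "question-answer": a list of questions
answers = transcript.lemur.question(
    questions=[aai.LemurQuestion(question="What topics were discussed?")],
)
```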
@@ -131,7 +124,7 @@ This component can be used as a standalone component to list all previously gene
2. The user can also input an LLM prompt. In this example, we want to generate a summary of the transcript.
3. The flow submits the audio file for transcription.
4. The flow checks the status of the transcript every few seconds until transcription is completed.
5. The flow parses the transcript and outputs the formatted text.
5. The flow parses the transcription result and outputs the transcribed text.
6. The flow also generates subtitles.
7. The flow applies the LLM prompt to generate a summary.
8. As a standalone component, all transcripts can be listed.
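Taken together, the flow automates roughly the following SDK sequence (a sketch, assuming the `assemblyai` package; the file path and prompt are placeholders):

```python
import assemblyai as aai

aai.settings.api_key = "YOUR_API_KEY"

# Steps 3-4: submit the file and wait for completion (transcribe() polls internally)
transcript = aai.Transcriber().transcribe("./meeting.mp3")

# Step 5: the transcribed text
print(transcript.text)

# Step 6: subtitles
print(transcript.export_subtitles_srt())

# Step 7: apply an LLM prompt via LeMUR (requires an upgraded account)
result = transcript.lemur.task(prompt="Summarize this meeting.")
print(result.response)
```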
@@ -145,7 +138,7 @@ To run the Transcription and Speech AI Flow:
3. Connect the components as shown in the flow diagram. **Tip**: Freeze the path of the *Start Transcript* component to only submit the file once.
4. Input the AssemblyAI API key in all components that require the key (Start Transcript, Poll Transcript, Get Subtitles, LeMUR, List Transcripts).
5. Select an audio or video file in the *Start Transcript* component.
6. Run the flow by clicking **Play** on the *Parse Transcript* component.
6. Run the flow by clicking **Play** on the *Parse Data* component. Make sure that the specified template is `{text}`.
7. To generate subtitles, click **Play** on the *Get Subtitles* component.
8. To apply an LLM to your audio file, click **Play** on the *LeMUR* component. Note that you need an upgraded AssemblyAI account to use LeMUR.
9. To list all transcripts, click **Play** on the *List Transcript* component.

View file

@@ -74,7 +74,7 @@ dependencies = [
"elasticsearch>=8.12.0",
"pytube>=15.0.0",
"dspy-ai>=2.4.0",
"assemblyai>=0.26.0",
"assemblyai>=0.33.0",
"litellm>=1.44.0",
"chromadb>=0.4",
"langchain-anthropic>=0.1.23",

View file

@@ -1,65 +0,0 @@
import datetime

from langflow.custom import Component
from langflow.io import DataInput, Output
from langflow.schema import Data


class AssemblyAITranscriptionParser(Component):
    display_name = "AssemblyAI Parse Transcript"
    description = (
        "Parse AssemblyAI transcription result. "
        "If Speaker Labels was enabled, format utterances with speakers and timestamps"
    )
    documentation = "https://www.assemblyai.com/docs"
    icon = "AssemblyAI"

    inputs = [
        DataInput(
            name="transcription_result",
            display_name="Transcription Result",
            info="The transcription result from AssemblyAI",
        ),
    ]

    outputs = [
        Output(display_name="Parsed Transcription", name="parsed_transcription", method="parse_transcription"),
    ]

    def parse_transcription(self) -> Data:
        # check if it's an error message from the previous step
        if self.transcription_result.data.get("error"):
            self.status = self.transcription_result.data["error"]
            return self.transcription_result

        try:
            transcription_data = self.transcription_result.data
            if transcription_data.get("utterances"):
                # If speaker diarization was enabled
                parsed_result = self.parse_with_speakers(transcription_data["utterances"])
            elif transcription_data.get("text"):
                # If speaker diarization was not enabled
                parsed_result = transcription_data["text"]
            else:
                raise ValueError("Unexpected transcription format")

            self.status = parsed_result
            return Data(data={"text": parsed_result})
        except Exception as e:
            error_message = f"Error parsing transcription: {str(e)}"
            self.status = error_message
            return Data(data={"error": error_message})

    def parse_with_speakers(self, utterances: list[dict]) -> str:
        parsed_result = []
        for utterance in utterances:
            speaker = utterance["speaker"]
            start_time = self.format_timestamp(utterance["start"])
            text = utterance["text"]
            parsed_result.append(f'Speaker {speaker} {start_time}\n"{text}"\n')
        return "\n".join(parsed_result)

    def format_timestamp(self, milliseconds: int) -> str:
        return str(datetime.timedelta(milliseconds=milliseconds)).split(".")[0]
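As a quick check of the removed helper's behavior, a standalone worked example of the timestamp formatting (pure stdlib):

```python
import datetime


def format_timestamp(milliseconds: int) -> str:
    # Same logic as the removed component: render as H:MM:SS, dropping fractional seconds.
    return str(datetime.timedelta(milliseconds=milliseconds)).split(".")[0]


print(format_timestamp(83_000))     # 0:01:23
print(format_timestamp(3_725_500))  # 1:02:05
```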

View file

@@ -1,7 +1,7 @@
import assemblyai as aai
from langflow.custom import Component
from langflow.io import DataInput, DropdownInput, FloatInput, IntInput, MessageInput, Output, SecretStrInput
from langflow.io import DataInput, DropdownInput, FloatInput, IntInput, MultilineInput, Output, SecretStrInput
from langflow.schema import Data
@@ -23,7 +23,7 @@ class AssemblyAILeMUR(Component):
            display_name="Transcription Result",
            info="The transcription result from AssemblyAI",
        ),
        MessageInput(
        MultilineInput(
            name="prompt",
            display_name="Input Prompt",
            info="The text to prompt the model",
@@ -34,6 +34,7 @@ class AssemblyAILeMUR(Component):
            options=["claude3_5_sonnet", "claude3_opus", "claude3_haiku", "claude3_sonnet"],
            value="claude3_5_sonnet",
            info="The model that is used for the final prompt after compression is performed",
            advanced=True,
        ),
        FloatInput(
            name="temperature",
@@ -49,6 +50,32 @@ class AssemblyAILeMUR(Component):
            value=2000,
            info="Max output size in tokens, up to 4000",
        ),
        DropdownInput(
            name="endpoint",
            display_name="Endpoint",
            options=["task", "summary", "question-answer"],
            value="task",
            info=(
                "The LeMUR endpoint to use. For 'summary' and 'question-answer',"
                " no prompt input is needed. See https://www.assemblyai.com/docs/api-reference/lemur/ for more info."
            ),
            advanced=True,
        ),
        MultilineInput(
            name="questions",
            display_name="Questions",
            info="Comma-separated list of your questions. Only used if Endpoint is 'question-answer'",
            advanced=True,
        ),
        MultilineInput(
            name="transcript_ids",
            display_name="Transcript IDs",
            info=(
                "Comma-separated list of transcript IDs. LeMUR can perform actions over multiple transcripts."
                " If provided, the Transcription Result is ignored."
            ),
            advanced=True,
        ),
    ]

    outputs = [
@@ -59,41 +86,87 @@
        """Use the LeMUR task endpoint to input the LLM prompt."""
        aai.settings.api_key = self.api_key
        # check if it's an error message from the previous step
        if self.transcription_result.data.get("error"):
        if not self.transcription_result and not self.transcript_ids:
            error = "Either a Transcription Result or Transcript IDs must be provided"
            self.status = error
            return Data(data={"error": error})
        elif self.transcription_result and self.transcription_result.data.get("error"):
            # error message from the previous step
            self.status = self.transcription_result.data["error"]
            return self.transcription_result
        if not self.prompt or not self.prompt.text:
            self.status = "No prompt specified"
        elif self.endpoint == "task" and not self.prompt:
            self.status = "No prompt specified for the task endpoint"
            return Data(data={"error": "No prompt specified"})
        try:
            transcript = aai.Transcript.get_by_id(self.transcription_result.data["id"])
        except Exception as e:
            error = f"Getting transcription failed: {str(e)}"
        elif self.endpoint == "question-answer" and not self.questions:
            error = "No Questions were provided for the question-answer endpoint"
            self.status = error
            return Data(data={"error": error})
        if transcript.status == aai.TranscriptStatus.completed:
            try:
                result = transcript.lemur.task(
                    prompt=self.prompt.text,
                    final_model=self.get_final_model(self.final_model),
                    temperature=self.temperature,
                    max_output_size=self.max_output_size,
                )
        # Check for valid transcripts
        transcript_ids = None
        if self.transcription_result and "id" in self.transcription_result.data:
            transcript_ids = [self.transcription_result.data["id"]]
        elif self.transcript_ids:
            transcript_ids = self.transcript_ids.split(",") or []
            transcript_ids = [t.strip() for t in transcript_ids]
                result = Data(data=result.dict())
                self.status = result
                return result
            except Exception as e:
                error = f"An Exception happened while calling LeMUR: {str(e)}"
                self.status = error
                return Data(data={"error": error})
        if not transcript_ids:
            error = "Either a valid Transcription Result or valid Transcript IDs must be provided"
            self.status = error
            return Data(data={"error": error})
        # Get TranscriptGroup and check if there is any error
        transcript_group = aai.TranscriptGroup(transcript_ids=transcript_ids)
        transcript_group, failures = transcript_group.wait_for_completion(return_failures=True)
        if failures:
            error = f"Getting transcriptions failed: {failures[0]}"
            self.status = error
            return Data(data={"error": error})
        for t in transcript_group.transcripts:
            if t.status == aai.TranscriptStatus.error:
                self.status = t.error
                return Data(data={"error": t.error})
        # Perform LeMUR action
        try:
            response = self.perform_lemur_action(transcript_group, self.endpoint)
            result = Data(data=response)
            self.status = result
            return result
        except Exception as e:
            error = f"An Error happened: {str(e)}"
            self.status = error
            return Data(data={"error": error})
    def perform_lemur_action(self, transcript_group: aai.TranscriptGroup, endpoint: str) -> dict:
        if endpoint == "task":
            result = transcript_group.lemur.task(
                prompt=self.prompt,
                final_model=self.get_final_model(self.final_model),
                temperature=self.temperature,
                max_output_size=self.max_output_size,
            )
        elif endpoint == "summary":
            result = transcript_group.lemur.summarize(
                final_model=self.get_final_model(self.final_model),
                temperature=self.temperature,
                max_output_size=self.max_output_size,
            )
        elif endpoint == "question-answer":
            questions = self.questions.split(",")
            questions = [aai.LemurQuestion(question=q) for q in questions]
            result = transcript_group.lemur.question(
                questions=questions,
                final_model=self.get_final_model(self.final_model),
                temperature=self.temperature,
                max_output_size=self.max_output_size,
            )
        else:
            self.status = transcript.error
            return Data(data={"error": transcript.error})
            raise ValueError(f"Endpoint not supported: {endpoint}")
        return result.dict()
    def get_final_model(self, model_name: str) -> aai.LemurModel:
        if model_name == "claude3_5_sonnet":
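The diff view cuts `get_final_model` off here. A plausible completion, inferred from the dropdown options above; this is an assumption, and the actual commit may differ:

```python
# Hypothetical completion of the truncated mapping, based on the
# component's model options and the assemblyai SDK's LemurModel enum.
def get_final_model(self, model_name: str) -> aai.LemurModel:
    if model_name == "claude3_5_sonnet":
        return aai.LemurModel.claude3_5_sonnet
    elif model_name == "claude3_opus":
        return aai.LemurModel.claude3_opus
    elif model_name == "claude3_haiku":
        return aai.LemurModel.claude3_haiku
    elif model_name == "claude3_sonnet":
        return aai.LemurModel.claude3_sonnet
    else:
        raise ValueError(f"Model name not supported: {model_name}")
```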

View file

@@ -29,16 +29,19 @@ class AssemblyAIListTranscripts(Component):
            options=["all", "queued", "processing", "completed", "error"],
            value="all",
            info="Filter by transcript status",
            advanced=True,
        ),
        MessageTextInput(
            name="created_on",
            display_name="Created On",
            info="Only get transcripts created on this date (YYYY-MM-DD)",
            advanced=True,
        ),
        BoolInput(
            name="throttled_only",
            display_name="Throttled Only",
            info="Only get throttled transcripts, overrides the status filter",
            advanced=True,
        ),
    ]
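A sketch of the equivalent SDK call for these filters; the `ListTranscriptParameters` usage is an assumption based on the `assemblyai` SDK, not part of this diff:

```python
import assemblyai as aai

aai.settings.api_key = "YOUR_API_KEY"

# Field names follow the SDK's ListTranscriptParameters (assumed here).
params = aai.ListTranscriptParameters(
    status=aai.TranscriptStatus.completed,  # the Status filter
    throttled_only=False,                   # the Throttled Only toggle
)
page = aai.Transcriber().list_transcripts(params)
for item in page.transcripts:
    print(item.id, item.status)
```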

View file

@@ -27,6 +27,7 @@ class AssemblyAITranscriptionJobPoller(Component):
            display_name="Polling Interval",
            value=3.0,
            info="The polling interval in seconds",
            advanced=True,
        ),
    ]
@@ -52,7 +53,13 @@ class AssemblyAITranscriptionJobPoller(Component):
            return Data(data={"error": error})
        if transcript.status == aai.TranscriptStatus.completed:
            data = Data(data=transcript.json_response)
            json_response = transcript.json_response
            text = json_response.pop("text", None)
            utterances = json_response.pop("utterances", None)
            transcript_id = json_response.pop("id", None)
            sorted_data = {"text": text, "utterances": utterances, "id": transcript_id}
            sorted_data.update(json_response)
            data = Data(data=sorted_data)
            self.status = data
            return data
        else:

View file

@@ -81,18 +81,24 @@ class AssemblyAITranscriptionJobCreator(Component):
            ],
            value="best",
            info="The speech model to use for the transcription",
            advanced=True,
        ),
        BoolInput(
            name="language_detection",
            display_name="Automatic Language Detection",
            info="Enable automatic language detection",
            advanced=True,
        ),
        MessageTextInput(
            name="language_code",
            display_name="Language",
            info="The language of the audio file. Can be set manually if automatic language detection is disabled.\n"
            "See https://www.assemblyai.com/docs/getting-started/supported-languages "
            "for a list of supported language codes.",
            info=(
                "The language of the audio file. Can be set manually if automatic language detection is disabled. "
                "See https://www.assemblyai.com/docs/getting-started/supported-languages "
                "for a list of supported language codes."
            ),
            advanced=True,
        ),
        BoolInput(
            name="speaker_labels",

View file

@@ -123,6 +123,7 @@ pytest-split = "^0.9.0"
devtools = "^0.12.2"
pytest-flakefinder = "^1.1.0"
types-markdown = "^3.7.0.20240822"
assemblyai = "^0.33.0"
[tool.pytest.ini_options]
@@ -244,7 +245,8 @@ dependencies = [
"crewai>=0.36.0",
"spider-client>=0.0.27",
"diskcache>=5.6.3",
"clickhouse-connect==0.7.19"
"clickhouse-connect==0.7.19",
"assemblyai>=0.33.0"
]
# Optional dependencies for uv

uv.lock generated
View file

@@ -3520,7 +3520,7 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "assemblyai", specifier = ">=0.26.0" },
{ name = "assemblyai", specifier = ">=0.33.0" },
{ name = "astra-assistants", specifier = ">=2.1.2" },
{ name = "beautifulsoup4", specifier = ">=4.12.2" },
{ name = "boto3", specifier = "~=1.34.162" },
@@ -3646,6 +3646,7 @@ source = { editable = "src/backend/base" }
dependencies = [
{ name = "aiofiles" },
{ name = "alembic" },
{ name = "assemblyai" },
{ name = "asyncer" },
{ name = "bcrypt" },
{ name = "cachetools" },
@@ -3755,6 +3756,7 @@ local = [
requires-dist = [
{ name = "aiofiles", specifier = ">=24.1.0" },
{ name = "alembic", specifier = ">=1.13.0" },
{ name = "assemblyai", specifier = ">=0.33.0" },
{ name = "asyncer", specifier = ">=0.0.5" },
{ name = "bcrypt", specifier = "==4.0.1" },
{ name = "cachetools", specifier = ">=5.3.1" },