fix: enhancements for the AssemblyAI component (#3934)
Enhancements for the AssemblyAI components.

Co-authored-by: Patrick Loeber <98830383+ploeber@users.noreply.github.com>

parent d92b3cb12b, commit d19c16462b

11 changed files with 137 additions and 116 deletions

(One file diff is suppressed because one or more lines are too long. A binary image was also updated and is not shown; before: 392 KiB, after: 233 KiB.)
@@ -63,22 +63,12 @@ This component allows you to poll the transcripts. It checks the status of the
 
 - **Input**:
   - AssemblyAI API Key: Your API key.
-  - Polling Interval: The polling interval in seconds. Default is 3.
+  - Polling Interval (Optional): The polling interval in seconds. Default is 3.
 
 - **Output**:
   - Transcription Result: The AssemblyAI JSON response of a completed transcript. Contains the text and other info.
 
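For reference, a minimal sketch of the polling loop this component implements, using the assemblyai SDK directly; the API key and transcript ID are placeholders:

```python
import time

import assemblyai as aai

aai.settings.api_key = "YOUR_API_KEY"  # placeholder


def poll_transcript(transcript_id: str, interval: float = 3.0) -> dict:
    """Poll until the transcript completes, mirroring the component's loop."""
    while True:
        transcript = aai.Transcript.get_by_id(transcript_id)
        if transcript.status == aai.TranscriptStatus.completed:
            return transcript.json_response
        if transcript.status == aai.TranscriptStatus.error:
            raise RuntimeError(transcript.error)
        time.sleep(interval)
```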
 ### AssemblyAI Parse Transcript
 
 This component allows you to parse a *Transcription Result* and output the formatted text.
 
 - **Input**:
   - Transcription Result: The output of the *Poll Transcript* component.
 
 - **Output**:
   - Parsed Transcription: The parsed transcript. If Speaker Labels was enabled in the *Start Transcript* component, it formats utterances with speakers and timestamps.
 ### AssemblyAI Get Subtitles
 
 This component allows you to generate subtitles in SRT or VTT format.
@@ -90,7 +80,7 @@ This component allows you to generate subtitles in SRT or VTT format.
   - Characters per Caption (Optional): The maximum number of characters per caption (0 for no limit).
 
 - **Output**:
-  - Parsed Transcription: The parsed transcript. If Speaker Labels was enabled in the *Start Transcript* component, it formats utterances with speakers and timestamps.
+  - Subtitles: A JSON response with the `subtitles` field containing the captions in SRT or VTT format.
 
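Subtitle export maps to two SDK calls; a hedged sketch, where the `chars_per_caption` value is just an example:

```python
import assemblyai as aai

aai.settings.api_key = "YOUR_API_KEY"  # placeholder

transcript = aai.Transcript.get_by_id("TRANSCRIPT_ID")  # placeholder ID
srt = transcript.export_subtitles_srt(chars_per_caption=32)  # SRT captions
vtt = transcript.export_subtitles_vtt()  # VTT captions, no length cap
```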
 ### AssemblyAI LeMUR
@@ -106,6 +96,9 @@ LeMUR automatically ingests the transcript as additional context, making it easy
   - Final Model: The model that is used for the final prompt after compression is performed. Default is Claude 3.5 Sonnet.
   - Temperature (Optional): The temperature to use for the model. Default is 0.0.
   - Max Output Size (Optional): Max output size in tokens, up to 4000. Default is 2000.
+  - Endpoint (Optional): The LeMUR endpoint to use. Default is "task". For "summary" and "question-answer", no prompt input is needed. See the [LeMUR API docs](https://www.assemblyai.com/docs/api-reference/lemur/) for more info.
+  - Questions (Optional): Comma-separated list of your questions. Only used if *Endpoint* is "question-answer".
+  - Transcript IDs (Optional): Comma-separated list of transcript IDs. LeMUR can perform actions over multiple transcripts. If provided, the *Transcription Result* is ignored.
 
 - **Output**:
   - LeMUR Response: The generated LLM response.
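For the default "task" endpoint, a minimal SDK-level sketch; the prompt and transcript ID are placeholders, and `aai.LemurModel.claude3_5_sonnet` is the enum value the component's default maps to:

```python
import assemblyai as aai

aai.settings.api_key = "YOUR_API_KEY"  # placeholder

transcript = aai.Transcript.get_by_id("TRANSCRIPT_ID")  # placeholder ID
result = transcript.lemur.task(
    prompt="Summarize the key points of this recording.",  # example prompt
    final_model=aai.LemurModel.claude3_5_sonnet,
    temperature=0.0,
    max_output_size=2000,
)
print(result.response)
```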
@@ -131,7 +124,7 @@ This component can be used as a standalone component to list all previously generated
 2. The user can also input an LLM prompt. In this example, we want to generate a summary of the transcript.
 3. The flow submits the audio file for transcription.
 4. The flow checks the status of the transcript every few seconds until transcription is completed.
-5. The flow parses the transcript and outputs the formatted text.
+5. The flow parses the transcription result and outputs the transcribed text.
 6. The flow also generates subtitles.
 7. The flow applies the LLM prompt to generate a summary.
 8. As a standalone component, all transcripts can be listed.
@@ -145,7 +138,7 @@ To run the Transcription and Speech AI Flow:
 3. Connect the components as shown in the flow diagram. **Tip**: Freeze the path of the *Start Transcript* component to only submit the file once.
 4. Input the AssemblyAI API key in all components that require the key (Start Transcript, Poll Transcript, Get Subtitles, LeMUR, List Transcripts).
 5. Select an audio or video file in the *Start Transcript* component.
-6. Run the flow by clicking **Play** on the *Parse Transcript* component.
+6. Run the flow by clicking **Play** on the *Parse Data* component. Make sure that the specified template is `{text}`.
 7. To generate subtitles, click **Play** on the *Get Subtitles* component.
 8. To apply an LLM to your audio file, click **Play** on the *LeMUR* component. Note that you need an upgraded AssemblyAI account to use LeMUR.
 9. To list all transcripts, click **Play** on the *List Transcripts* component.
@@ -74,7 +74,7 @@ dependencies = [
     "elasticsearch>=8.12.0",
     "pytube>=15.0.0",
     "dspy-ai>=2.4.0",
-    "assemblyai>=0.26.0",
+    "assemblyai>=0.33.0",
     "litellm>=1.44.0",
     "chromadb>=0.4",
     "langchain-anthropic>=0.1.23",
@@ -1,65 +0,0 @@
-import datetime
-
-from langflow.custom import Component
-from langflow.io import DataInput, Output
-from langflow.schema import Data
-
-
-class AssemblyAITranscriptionParser(Component):
-    display_name = "AssemblyAI Parse Transcript"
-    description = (
-        "Parse AssemblyAI transcription result. "
-        "If Speaker Labels was enabled, format utterances with speakers and timestamps"
-    )
-    documentation = "https://www.assemblyai.com/docs"
-    icon = "AssemblyAI"
-
-    inputs = [
-        DataInput(
-            name="transcription_result",
-            display_name="Transcription Result",
-            info="The transcription result from AssemblyAI",
-        ),
-    ]
-
-    outputs = [
-        Output(display_name="Parsed Transcription", name="parsed_transcription", method="parse_transcription"),
-    ]
-
-    def parse_transcription(self) -> Data:
-        # check if it's an error message from the previous step
-        if self.transcription_result.data.get("error"):
-            self.status = self.transcription_result.data["error"]
-            return self.transcription_result
-
-        try:
-            transcription_data = self.transcription_result.data
-
-            if transcription_data.get("utterances"):
-                # If speaker diarization was enabled
-                parsed_result = self.parse_with_speakers(transcription_data["utterances"])
-            elif transcription_data.get("text"):
-                # If speaker diarization was not enabled
-                parsed_result = transcription_data["text"]
-            else:
-                raise ValueError("Unexpected transcription format")
-
-            self.status = parsed_result
-            return Data(data={"text": parsed_result})
-        except Exception as e:
-            error_message = f"Error parsing transcription: {str(e)}"
-            self.status = error_message
-            return Data(data={"error": error_message})
-
-    def parse_with_speakers(self, utterances: list[dict]) -> str:
-        parsed_result = []
-        for utterance in utterances:
-            speaker = utterance["speaker"]
-            start_time = self.format_timestamp(utterance["start"])
-            text = utterance["text"]
-            parsed_result.append(f'Speaker {speaker} {start_time}\n"{text}"\n')
-
-        return "\n".join(parsed_result)
-
-    def format_timestamp(self, milliseconds: int) -> str:
-        return str(datetime.timedelta(milliseconds=milliseconds)).split(".")[0]
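This parser component is removed by the commit (step 6 of the flow now uses a generic *Parse Data* component with a `{text}` template instead). For the record, a self-contained sketch of the speaker formatting it performed, with made-up utterance dicts in the shape AssemblyAI returns:

```python
import datetime


def format_timestamp(milliseconds: int) -> str:
    # e.g. 4200 -> "0:00:04"
    return str(datetime.timedelta(milliseconds=milliseconds)).split(".")[0]


utterances = [  # hypothetical data for illustration
    {"speaker": "A", "start": 0, "text": "Hello and welcome."},
    {"speaker": "B", "start": 4200, "text": "Thanks for having me."},
]
for u in utterances:
    print(f'Speaker {u["speaker"]} {format_timestamp(u["start"])}\n"{u["text"]}"\n')
```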
@@ -1,7 +1,7 @@
 import assemblyai as aai
 
 from langflow.custom import Component
-from langflow.io import DataInput, DropdownInput, FloatInput, IntInput, MessageInput, Output, SecretStrInput
+from langflow.io import DataInput, DropdownInput, FloatInput, IntInput, MultilineInput, Output, SecretStrInput
 from langflow.schema import Data
@@ -23,7 +23,7 @@ class AssemblyAILeMUR(Component):
             display_name="Transcription Result",
             info="The transcription result from AssemblyAI",
         ),
-        MessageInput(
+        MultilineInput(
             name="prompt",
             display_name="Input Prompt",
             info="The text to prompt the model",
@@ -34,6 +34,7 @@ class AssemblyAILeMUR(Component):
             options=["claude3_5_sonnet", "claude3_opus", "claude3_haiku", "claude3_sonnet"],
             value="claude3_5_sonnet",
             info="The model that is used for the final prompt after compression is performed",
+            advanced=True,
         ),
         FloatInput(
             name="temperature",
@@ -49,6 +50,32 @@ class AssemblyAILeMUR(Component):
             value=2000,
             info="Max output size in tokens, up to 4000",
         ),
+        DropdownInput(
+            name="endpoint",
+            display_name="Endpoint",
+            options=["task", "summary", "question-answer"],
+            value="task",
+            info=(
+                "The LeMUR endpoint to use. For 'summary' and 'question-answer',"
+                " no prompt input is needed. See https://www.assemblyai.com/docs/api-reference/lemur/ for more info."
+            ),
+            advanced=True,
+        ),
+        MultilineInput(
+            name="questions",
+            display_name="Questions",
+            info="Comma-separated list of your questions. Only used if Endpoint is 'question-answer'",
+            advanced=True,
+        ),
+        MultilineInput(
+            name="transcript_ids",
+            display_name="Transcript IDs",
+            info=(
+                "Comma-separated list of transcript IDs. LeMUR can perform actions over multiple transcripts."
+                " If provided, the Transcription Result is ignored."
+            ),
+            advanced=True,
+        ),
     ]
 
     outputs = [
@@ -59,41 +86,87 @@ class AssemblyAILeMUR(Component):
         """Use the LeMUR task endpoint to input the LLM prompt."""
         aai.settings.api_key = self.api_key
 
-        # check if it's an error message from the previous step
-        if self.transcription_result.data.get("error"):
+        if not self.transcription_result and not self.transcript_ids:
+            error = "Either a Transcription Result or Transcript IDs must be provided"
+            self.status = error
+            return Data(data={"error": error})
+        elif self.transcription_result and self.transcription_result.data.get("error"):
+            # error message from the previous step
             self.status = self.transcription_result.data["error"]
             return self.transcription_result
 
-        if not self.prompt or not self.prompt.text:
-            self.status = "No prompt specified"
+        elif self.endpoint == "task" and not self.prompt:
+            self.status = "No prompt specified for the task endpoint"
             return Data(data={"error": "No prompt specified"})
 
-        try:
-            transcript = aai.Transcript.get_by_id(self.transcription_result.data["id"])
-        except Exception as e:
-            error = f"Getting transcription failed: {str(e)}"
+        elif self.endpoint == "question-answer" and not self.questions:
+            error = "No Questions were provided for the question-answer endpoint"
             self.status = error
             return Data(data={"error": error})
 
-        if transcript.status == aai.TranscriptStatus.completed:
-            try:
-                result = transcript.lemur.task(
-                    prompt=self.prompt.text,
-                    final_model=self.get_final_model(self.final_model),
-                    temperature=self.temperature,
-                    max_output_size=self.max_output_size,
-                )
+        # Check for valid transcripts
+        transcript_ids = None
+        if self.transcription_result and "id" in self.transcription_result.data:
+            transcript_ids = [self.transcription_result.data["id"]]
+        elif self.transcript_ids:
+            transcript_ids = self.transcript_ids.split(",") or []
+            transcript_ids = [t.strip() for t in transcript_ids]
 
-                result = Data(data=result.dict())
-                self.status = result
-                return result
-            except Exception as e:
-                error = f"An Exception happened while calling LeMUR: {str(e)}"
-                self.status = error
-                return Data(data={"error": error})
+        if not transcript_ids:
+            error = "Either a valid Transcription Result or valid Transcript IDs must be provided"
+            self.status = error
+            return Data(data={"error": error})
 
+        # Get TranscriptGroup and check if there is any error
+        transcript_group = aai.TranscriptGroup(transcript_ids=transcript_ids)
+        transcript_group, failures = transcript_group.wait_for_completion(return_failures=True)
+        if failures:
+            error = f"Getting transcriptions failed: {failures[0]}"
+            self.status = error
+            return Data(data={"error": error})
 
+        for t in transcript_group.transcripts:
+            if t.status == aai.TranscriptStatus.error:
+                self.status = t.error
+                return Data(data={"error": t.error})
 
+        # Perform LeMUR action
+        try:
+            response = self.perform_lemur_action(transcript_group, self.endpoint)
+            result = Data(data=response)
+            self.status = result
+            return result
+        except Exception as e:
+            error = f"An Error happened: {str(e)}"
+            self.status = error
+            return Data(data={"error": error})
 
+    def perform_lemur_action(self, transcript_group: aai.TranscriptGroup, endpoint: str) -> dict:
+        if endpoint == "task":
+            result = transcript_group.lemur.task(
+                prompt=self.prompt,
+                final_model=self.get_final_model(self.final_model),
+                temperature=self.temperature,
+                max_output_size=self.max_output_size,
+            )
+        elif endpoint == "summary":
+            result = transcript_group.lemur.summarize(
+                final_model=self.get_final_model(self.final_model),
+                temperature=self.temperature,
+                max_output_size=self.max_output_size,
+            )
+        elif endpoint == "question-answer":
+            questions = self.questions.split(",")
+            questions = [aai.LemurQuestion(question=q) for q in questions]
+            result = transcript_group.lemur.question(
+                questions=questions,
+                final_model=self.get_final_model(self.final_model),
+                temperature=self.temperature,
+                max_output_size=self.max_output_size,
+            )
         else:
-            self.status = transcript.error
-            return Data(data={"error": transcript.error})
+            raise ValueError(f"Endpoint not supported: {endpoint}")
+
+        return result.dict()
 
     def get_final_model(self, model_name: str) -> aai.LemurModel:
         if model_name == "claude3_5_sonnet":
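The multi-transcript path above boils down to a few SDK calls; a standalone sketch mirroring them, with placeholder transcript IDs:

```python
import assemblyai as aai

aai.settings.api_key = "YOUR_API_KEY"  # placeholder

group = aai.TranscriptGroup(transcript_ids=["id_1", "id_2"])  # placeholder IDs
group, failures = group.wait_for_completion(return_failures=True)
if not failures:
    result = group.lemur.summarize(final_model=aai.LemurModel.claude3_5_sonnet)
    print(result.response)
```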
@@ -29,16 +29,19 @@ class AssemblyAIListTranscripts(Component):
             options=["all", "queued", "processing", "completed", "error"],
             value="all",
             info="Filter by transcript status",
+            advanced=True,
         ),
         MessageTextInput(
             name="created_on",
             display_name="Created On",
             info="Only get transcripts created on this date (YYYY-MM-DD)",
+            advanced=True,
         ),
         BoolInput(
             name="throttled_only",
             display_name="Throttled Only",
             info="Only get throttled transcripts, overrides the status filter",
+            advanced=True,
         ),
     ]
 
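These filters correspond to list parameters in the SDK; a hedged sketch, with field names assumed from `aai.ListTranscriptParameters`:

```python
import assemblyai as aai

aai.settings.api_key = "YOUR_API_KEY"  # placeholder

params = aai.ListTranscriptParameters(status="completed", throttled_only=False)
page = aai.Transcriber().list_transcripts(params=params)
for t in page.transcripts:
    print(t.id, t.status)
```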
@@ -27,6 +27,7 @@ class AssemblyAITranscriptionJobPoller(Component):
             display_name="Polling Interval",
             value=3.0,
             info="The polling interval in seconds",
+            advanced=True,
         ),
     ]
 
@@ -52,7 +53,13 @@ class AssemblyAITranscriptionJobPoller(Component):
             return Data(data={"error": error})
 
         if transcript.status == aai.TranscriptStatus.completed:
-            data = Data(data=transcript.json_response)
+            json_response = transcript.json_response
+            text = json_response.pop("text", None)
+            utterances = json_response.pop("utterances", None)
+            transcript_id = json_response.pop("id", None)
+            sorted_data = {"text": text, "utterances": utterances, "id": transcript_id}
+            sorted_data.update(json_response)
+            data = Data(data=sorted_data)
             self.status = data
             return data
         else:
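The pop-then-update above is just reordering: Python dicts preserve insertion order, so this puts the most useful fields first in the component's output. A tiny illustration with a made-up response:

```python
json_response = {"audio_url": "https://example.com/audio", "id": "abc123", "text": "hi", "utterances": None}
sorted_data = {
    "text": json_response.pop("text", None),
    "utterances": json_response.pop("utterances", None),
    "id": json_response.pop("id", None),
}
sorted_data.update(json_response)  # remaining keys keep their relative order
print(list(sorted_data))  # ['text', 'utterances', 'id', 'audio_url']
```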
@@ -81,18 +81,24 @@ class AssemblyAITranscriptionJobCreator(Component):
             ],
             value="best",
             info="The speech model to use for the transcription",
+            advanced=True,
         ),
         BoolInput(
             name="language_detection",
             display_name="Automatic Language Detection",
             info="Enable automatic language detection",
+            advanced=True,
         ),
         MessageTextInput(
             name="language_code",
             display_name="Language",
-            info="The language of the audio file. Can be set manually if automatic language detection is disabled.\n"
-            "See https://www.assemblyai.com/docs/getting-started/supported-languages "
-            "for a list of supported language codes.",
+            info=(
+                """
+                The language of the audio file. Can be set manually if automatic language detection is disabled.
+                See https://www.assemblyai.com/docs/getting-started/supported-languages """
+                "for a list of supported language codes."
+            ),
             advanced=True,
         ),
         BoolInput(
             name="speaker_labels",
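These inputs feed into the SDK's transcription config; a minimal sketch (not from this diff), with parameter names taken from the assemblyai SDK and placeholder values:

```python
import assemblyai as aai

aai.settings.api_key = "YOUR_API_KEY"  # placeholder

config = aai.TranscriptionConfig(
    speech_model=aai.SpeechModel.best,  # "best" or "nano"
    language_code="en",                 # used when language detection is off
    speaker_labels=True,                # enables utterances with speakers
)
transcript = aai.Transcriber(config=config).transcribe("path/or/url/to/audio")  # placeholder
```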
@@ -123,6 +123,7 @@ pytest-split = "^0.9.0"
 devtools = "^0.12.2"
 pytest-flakefinder = "^1.1.0"
 types-markdown = "^3.7.0.20240822"
+assemblyai = "^0.33.0"
 
 [tool.pytest.ini_options]
@@ -244,7 +245,8 @@ dependencies = [
     "crewai>=0.36.0",
     "spider-client>=0.0.27",
     "diskcache>=5.6.3",
-    "clickhouse-connect==0.7.19"
+    "clickhouse-connect==0.7.19",
+    "assemblyai>=0.33.0"
 ]
 
 # Optional dependencies for uv
uv.lock (generated)
@@ -3520,7 +3520,7 @@ dev = [
 
 [package.metadata]
 requires-dist = [
-    { name = "assemblyai", specifier = ">=0.26.0" },
+    { name = "assemblyai", specifier = ">=0.33.0" },
     { name = "astra-assistants", specifier = ">=2.1.2" },
     { name = "beautifulsoup4", specifier = ">=4.12.2" },
     { name = "boto3", specifier = "~=1.34.162" },
@@ -3646,6 +3646,7 @@ source = { editable = "src/backend/base" }
 dependencies = [
     { name = "aiofiles" },
     { name = "alembic" },
+    { name = "assemblyai" },
     { name = "asyncer" },
     { name = "bcrypt" },
     { name = "cachetools" },
@@ -3755,6 +3756,7 @@ local = [
 requires-dist = [
     { name = "aiofiles", specifier = ">=24.1.0" },
     { name = "alembic", specifier = ">=1.13.0" },
+    { name = "assemblyai", specifier = ">=0.33.0" },
    { name = "asyncer", specifier = ">=0.0.5" },
     { name = "bcrypt", specifier = "==4.0.1" },
     { name = "cachetools", specifier = ">=5.3.1" },