From f1775440a546737907b478e5f3549223a3473aff Mon Sep 17 00:00:00 2001 From: Yuqi Tang Date: Fri, 30 May 2025 15:22:23 -0700 Subject: [PATCH] fix: batch run description, output, and beta badge (#8304) * remove data to dataframe and message to data * fix batch run component --- src/backend/base/langflow/components/helpers/batch_run.py | 5 ++--- .../initial_setup/starter_projects/Youtube Analysis.json | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/backend/base/langflow/components/helpers/batch_run.py b/src/backend/base/langflow/components/helpers/batch_run.py index 0aaff36ab..b54523bd6 100644 --- a/src/backend/base/langflow/components/helpers/batch_run.py +++ b/src/backend/base/langflow/components/helpers/batch_run.py @@ -15,9 +15,8 @@ if TYPE_CHECKING: class BatchRunComponent(Component): display_name = "Batch Run" - description = "Runs an LLM over each row of a DataFrame's column. If no column is set, the entire row is passed." + description = "Runs an LLM on each row of a DataFrame column. If no column is specified, all columns are used." icon = "List" - beta = True inputs = [ HandleInput( @@ -69,7 +68,7 @@ class BatchRunComponent(Component): outputs = [ Output( - display_name="DataFrame", + display_name="LLM Results", name="batch_results", method="run_batch", info="A DataFrame with all original columns plus the model's response column.", diff --git a/src/backend/base/langflow/initial_setup/starter_projects/Youtube Analysis.json b/src/backend/base/langflow/initial_setup/starter_projects/Youtube Analysis.json index 3ae180c39..2ab19c703 100644 --- a/src/backend/base/langflow/initial_setup/starter_projects/Youtube Analysis.json +++ b/src/backend/base/langflow/initial_setup/starter_projects/Youtube Analysis.json @@ -319,11 +319,11 @@ "base_classes": [ "DataFrame" ], - "beta": true, + "beta": false, "category": "helpers", "conditional_paths": [], "custom_fields": {}, - "description": "Runs an LLM over each row of a DataFrame's column. If no column is set, the entire row is passed.", + "description": "Runs an LLM on each row of a DataFrame column. If no column is specified, all columns are used.", "display_name": "Batch Run", "documentation": "", "edited": false, @@ -345,7 +345,7 @@ { "allows_loop": false, "cache": true, - "display_name": "DataFrame", + "display_name": "LLM Results", "method": "run_batch", "name": "batch_results", "selected": "DataFrame", @@ -376,7 +376,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Any, cast\n\nimport toml # type: ignore[import-untyped]\nfrom loguru import logger\n\nfrom langflow.custom import Component\nfrom langflow.io import BoolInput, DataFrameInput, HandleInput, MessageTextInput, MultilineInput, Output\nfrom langflow.schema import DataFrame\n\nif TYPE_CHECKING:\n from langchain_core.runnables import Runnable\n\n\nclass BatchRunComponent(Component):\n display_name = \"Batch Run\"\n description = \"Runs an LLM over each row of a DataFrame's column. If no column is set, the entire row is passed.\"\n icon = \"List\"\n beta = True\n\n inputs = [\n HandleInput(\n name=\"model\",\n display_name=\"Language Model\",\n info=\"Connect the 'Language Model' output from your LLM component here.\",\n input_types=[\"LanguageModel\"],\n required=True,\n ),\n MultilineInput(\n name=\"system_message\",\n display_name=\"Instructions\",\n info=\"Multi-line system instruction for all rows in the DataFrame.\",\n required=False,\n ),\n DataFrameInput(\n name=\"df\",\n display_name=\"DataFrame\",\n info=\"The DataFrame whose column (specified by 'column_name') we'll treat as text messages.\",\n required=True,\n ),\n MessageTextInput(\n name=\"column_name\",\n display_name=\"Column Name\",\n info=(\n \"The name of the DataFrame column to treat as text messages. \"\n \"If empty, all columns will be formatted in TOML.\"\n ),\n required=False,\n advanced=False,\n ),\n MessageTextInput(\n name=\"output_column_name\",\n display_name=\"Output Column Name\",\n info=\"Name of the column where the model's response will be stored.\",\n value=\"model_response\",\n required=False,\n advanced=True,\n ),\n BoolInput(\n name=\"enable_metadata\",\n display_name=\"Enable Metadata\",\n info=\"If True, add metadata to the output DataFrame.\",\n value=False,\n required=False,\n advanced=True,\n ),\n ]\n\n outputs = [\n Output(\n display_name=\"DataFrame\",\n name=\"batch_results\",\n method=\"run_batch\",\n info=\"A DataFrame with all original columns plus the model's response column.\",\n ),\n ]\n\n def _format_row_as_toml(self, row: dict[str, Any]) -> str:\n \"\"\"Convert a dictionary (row) into a TOML-formatted string.\"\"\"\n formatted_dict = {str(col): {\"value\": str(val)} for col, val in row.items()}\n return toml.dumps(formatted_dict)\n\n def _create_base_row(\n self, original_row: dict[str, Any], model_response: str = \"\", batch_index: int = -1\n ) -> dict[str, Any]:\n \"\"\"Create a base row with original columns and additional metadata.\"\"\"\n row = original_row.copy()\n row[self.output_column_name] = model_response\n row[\"batch_index\"] = batch_index\n return row\n\n def _add_metadata(\n self, row: dict[str, Any], *, success: bool = True, system_msg: str = \"\", error: str | None = None\n ) -> None:\n \"\"\"Add metadata to a row if enabled.\"\"\"\n if not self.enable_metadata:\n return\n\n if success:\n row[\"metadata\"] = {\n \"has_system_message\": bool(system_msg),\n \"input_length\": len(row.get(\"text_input\", \"\")),\n \"response_length\": len(row[self.output_column_name]),\n \"processing_status\": \"success\",\n }\n else:\n row[\"metadata\"] = {\n \"error\": error,\n \"processing_status\": \"failed\",\n }\n\n async def run_batch(self) -> DataFrame:\n \"\"\"Process each row in df[column_name] with the language model asynchronously.\n\n Returns:\n DataFrame: A new DataFrame containing:\n - All original columns\n - The model's response column (customizable name)\n - 'batch_index' column for processing order\n - 'metadata' (optional)\n\n Raises:\n ValueError: If the specified column is not found in the DataFrame\n TypeError: If the model is not compatible or input types are wrong\n \"\"\"\n model: Runnable = self.model\n system_msg = self.system_message or \"\"\n df: DataFrame = self.df\n col_name = self.column_name or \"\"\n\n # Validate inputs first\n if not isinstance(df, DataFrame):\n msg = f\"Expected DataFrame input, got {type(df)}\"\n raise TypeError(msg)\n\n if col_name and col_name not in df.columns:\n msg = f\"Column '{col_name}' not found in the DataFrame. Available columns: {', '.join(df.columns)}\"\n raise ValueError(msg)\n\n try:\n # Determine text input for each row\n if col_name:\n user_texts = df[col_name].astype(str).tolist()\n else:\n user_texts = [\n self._format_row_as_toml(cast(dict[str, Any], row)) for row in df.to_dict(orient=\"records\")\n ]\n\n total_rows = len(user_texts)\n logger.info(f\"Processing {total_rows} rows with batch run\")\n\n # Prepare the batch of conversations\n conversations = [\n [{\"role\": \"system\", \"content\": system_msg}, {\"role\": \"user\", \"content\": text}]\n if system_msg\n else [{\"role\": \"user\", \"content\": text}]\n for text in user_texts\n ]\n\n # Configure the model with project info and callbacks\n model = model.with_config(\n {\n \"run_name\": self.display_name,\n \"project_name\": self.get_project_name(),\n \"callbacks\": self.get_langchain_callbacks(),\n }\n )\n # Process batches and track progress\n responses_with_idx = list(\n zip(\n range(len(conversations)),\n await model.abatch(list(conversations)),\n strict=True,\n )\n )\n\n # Sort by index to maintain order\n responses_with_idx.sort(key=lambda x: x[0])\n\n # Build the final data with enhanced metadata\n rows: list[dict[str, Any]] = []\n for idx, (original_row, response) in enumerate(\n zip(df.to_dict(orient=\"records\"), responses_with_idx, strict=False)\n ):\n response_text = response[1].content if hasattr(response[1], \"content\") else str(response[1])\n row = self._create_base_row(\n cast(dict[str, Any], original_row), model_response=response_text, batch_index=idx\n )\n self._add_metadata(row, success=True, system_msg=system_msg)\n rows.append(row)\n\n # Log progress\n if (idx + 1) % max(1, total_rows // 10) == 0:\n logger.info(f\"Processed {idx + 1}/{total_rows} rows\")\n\n logger.info(\"Batch processing completed successfully\")\n return DataFrame(rows)\n\n except (KeyError, AttributeError) as e:\n # Handle data structure and attribute access errors\n logger.error(f\"Data processing error: {e!s}\")\n error_row = self._create_base_row({col: \"\" for col in df.columns}, model_response=\"\", batch_index=-1)\n self._add_metadata(error_row, success=False, error=str(e))\n return DataFrame([error_row])\n" + "value": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Any, cast\n\nimport toml # type: ignore[import-untyped]\nfrom loguru import logger\n\nfrom langflow.custom import Component\nfrom langflow.io import BoolInput, DataFrameInput, HandleInput, MessageTextInput, MultilineInput, Output\nfrom langflow.schema import DataFrame\n\nif TYPE_CHECKING:\n from langchain_core.runnables import Runnable\n\n\nclass BatchRunComponent(Component):\n display_name = \"Batch Run\"\n description = \"Runs an LLM on each row of a DataFrame column. If no column is specified, all columns are used.\"\n icon = \"List\"\n\n inputs = [\n HandleInput(\n name=\"model\",\n display_name=\"Language Model\",\n info=\"Connect the 'Language Model' output from your LLM component here.\",\n input_types=[\"LanguageModel\"],\n required=True,\n ),\n MultilineInput(\n name=\"system_message\",\n display_name=\"Instructions\",\n info=\"Multi-line system instruction for all rows in the DataFrame.\",\n required=False,\n ),\n DataFrameInput(\n name=\"df\",\n display_name=\"DataFrame\",\n info=\"The DataFrame whose column (specified by 'column_name') we'll treat as text messages.\",\n required=True,\n ),\n MessageTextInput(\n name=\"column_name\",\n display_name=\"Column Name\",\n info=(\n \"The name of the DataFrame column to treat as text messages. \"\n \"If empty, all columns will be formatted in TOML.\"\n ),\n required=False,\n advanced=False,\n ),\n MessageTextInput(\n name=\"output_column_name\",\n display_name=\"Output Column Name\",\n info=\"Name of the column where the model's response will be stored.\",\n value=\"model_response\",\n required=False,\n advanced=True,\n ),\n BoolInput(\n name=\"enable_metadata\",\n display_name=\"Enable Metadata\",\n info=\"If True, add metadata to the output DataFrame.\",\n value=False,\n required=False,\n advanced=True,\n ),\n ]\n\n outputs = [\n Output(\n display_name=\"LLM Results\",\n name=\"batch_results\",\n method=\"run_batch\",\n info=\"A DataFrame with all original columns plus the model's response column.\",\n ),\n ]\n\n def _format_row_as_toml(self, row: dict[str, Any]) -> str:\n \"\"\"Convert a dictionary (row) into a TOML-formatted string.\"\"\"\n formatted_dict = {str(col): {\"value\": str(val)} for col, val in row.items()}\n return toml.dumps(formatted_dict)\n\n def _create_base_row(\n self, original_row: dict[str, Any], model_response: str = \"\", batch_index: int = -1\n ) -> dict[str, Any]:\n \"\"\"Create a base row with original columns and additional metadata.\"\"\"\n row = original_row.copy()\n row[self.output_column_name] = model_response\n row[\"batch_index\"] = batch_index\n return row\n\n def _add_metadata(\n self, row: dict[str, Any], *, success: bool = True, system_msg: str = \"\", error: str | None = None\n ) -> None:\n \"\"\"Add metadata to a row if enabled.\"\"\"\n if not self.enable_metadata:\n return\n\n if success:\n row[\"metadata\"] = {\n \"has_system_message\": bool(system_msg),\n \"input_length\": len(row.get(\"text_input\", \"\")),\n \"response_length\": len(row[self.output_column_name]),\n \"processing_status\": \"success\",\n }\n else:\n row[\"metadata\"] = {\n \"error\": error,\n \"processing_status\": \"failed\",\n }\n\n async def run_batch(self) -> DataFrame:\n \"\"\"Process each row in df[column_name] with the language model asynchronously.\n\n Returns:\n DataFrame: A new DataFrame containing:\n - All original columns\n - The model's response column (customizable name)\n - 'batch_index' column for processing order\n - 'metadata' (optional)\n\n Raises:\n ValueError: If the specified column is not found in the DataFrame\n TypeError: If the model is not compatible or input types are wrong\n \"\"\"\n model: Runnable = self.model\n system_msg = self.system_message or \"\"\n df: DataFrame = self.df\n col_name = self.column_name or \"\"\n\n # Validate inputs first\n if not isinstance(df, DataFrame):\n msg = f\"Expected DataFrame input, got {type(df)}\"\n raise TypeError(msg)\n\n if col_name and col_name not in df.columns:\n msg = f\"Column '{col_name}' not found in the DataFrame. Available columns: {', '.join(df.columns)}\"\n raise ValueError(msg)\n\n try:\n # Determine text input for each row\n if col_name:\n user_texts = df[col_name].astype(str).tolist()\n else:\n user_texts = [\n self._format_row_as_toml(cast(dict[str, Any], row)) for row in df.to_dict(orient=\"records\")\n ]\n\n total_rows = len(user_texts)\n logger.info(f\"Processing {total_rows} rows with batch run\")\n\n # Prepare the batch of conversations\n conversations = [\n [{\"role\": \"system\", \"content\": system_msg}, {\"role\": \"user\", \"content\": text}]\n if system_msg\n else [{\"role\": \"user\", \"content\": text}]\n for text in user_texts\n ]\n\n # Configure the model with project info and callbacks\n model = model.with_config(\n {\n \"run_name\": self.display_name,\n \"project_name\": self.get_project_name(),\n \"callbacks\": self.get_langchain_callbacks(),\n }\n )\n # Process batches and track progress\n responses_with_idx = list(\n zip(\n range(len(conversations)),\n await model.abatch(list(conversations)),\n strict=True,\n )\n )\n\n # Sort by index to maintain order\n responses_with_idx.sort(key=lambda x: x[0])\n\n # Build the final data with enhanced metadata\n rows: list[dict[str, Any]] = []\n for idx, (original_row, response) in enumerate(\n zip(df.to_dict(orient=\"records\"), responses_with_idx, strict=False)\n ):\n response_text = response[1].content if hasattr(response[1], \"content\") else str(response[1])\n row = self._create_base_row(\n cast(dict[str, Any], original_row), model_response=response_text, batch_index=idx\n )\n self._add_metadata(row, success=True, system_msg=system_msg)\n rows.append(row)\n\n # Log progress\n if (idx + 1) % max(1, total_rows // 10) == 0:\n logger.info(f\"Processed {idx + 1}/{total_rows} rows\")\n\n logger.info(\"Batch processing completed successfully\")\n return DataFrame(rows)\n\n except (KeyError, AttributeError) as e:\n # Handle data structure and attribute access errors\n logger.error(f\"Data processing error: {e!s}\")\n error_row = self._create_base_row({col: \"\" for col in df.columns}, model_response=\"\", batch_index=-1)\n self._add_metadata(error_row, success=False, error=str(e))\n return DataFrame([error_row])\n" }, "column_name": { "_input_type": "StrInput",