From f1775440a546737907b478e5f3549223a3473aff Mon Sep 17 00:00:00 2001
From: Yuqi Tang <yuqi.tang@datastax.com>
Date: Fri, 30 May 2025 15:22:23 -0700
Subject: [PATCH] fix: batch run description, output, and beta badge (#8304)

* remove data to dataframe and message to data

* fix batch run component: reword description, rename output to "LLM Results", drop beta flag
---
 src/backend/base/langflow/components/helpers/batch_run.py | 5 ++---
 .../initial_setup/starter_projects/Youtube Analysis.json  | 8 ++++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/backend/base/langflow/components/helpers/batch_run.py b/src/backend/base/langflow/components/helpers/batch_run.py
index 0aaff36ab..b54523bd6 100644
--- a/src/backend/base/langflow/components/helpers/batch_run.py
+++ b/src/backend/base/langflow/components/helpers/batch_run.py
@@ -15,9 +15,8 @@ if TYPE_CHECKING:
 
 class BatchRunComponent(Component):
     display_name = "Batch Run"
-    description = "Runs an LLM over each row of a DataFrame's column. If no column is set, the entire row is passed."
+    description = "Runs an LLM on each row of a DataFrame column. If no column is specified, all columns are used."
     icon = "List"
-    beta = True
 
     inputs = [
         HandleInput(
@@ -69,7 +68,7 @@ class BatchRunComponent(Component):
 
     outputs = [
         Output(
-            display_name="DataFrame",
+            display_name="LLM Results",
             name="batch_results",
             method="run_batch",
             info="A DataFrame with all original columns plus the model's response column.",
diff --git a/src/backend/base/langflow/initial_setup/starter_projects/Youtube Analysis.json b/src/backend/base/langflow/initial_setup/starter_projects/Youtube Analysis.json
index 3ae180c39..2ab19c703 100644
--- a/src/backend/base/langflow/initial_setup/starter_projects/Youtube Analysis.json	
+++ b/src/backend/base/langflow/initial_setup/starter_projects/Youtube Analysis.json	
@@ -319,11 +319,11 @@
             "base_classes": [
               "DataFrame"
             ],
-            "beta": true,
+            "beta": false,
             "category": "helpers",
             "conditional_paths": [],
             "custom_fields": {},
-            "description": "Runs an LLM over each row of a DataFrame's column. If no column is set, the entire row is passed.",
+            "description": "Runs an LLM on each row of a DataFrame column. If no column is specified, all columns are used.",
             "display_name": "Batch Run",
             "documentation": "",
             "edited": false,
@@ -345,7 +345,7 @@
               {
                 "allows_loop": false,
                 "cache": true,
-                "display_name": "DataFrame",
+                "display_name": "LLM Results",
                 "method": "run_batch",
                 "name": "batch_results",
                 "selected": "DataFrame",
@@ -376,7 +376,7 @@
                 "show": true,
                 "title_case": false,
                 "type": "code",
-                "value": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Any, cast\n\nimport toml  # type: ignore[import-untyped]\nfrom loguru import logger\n\nfrom langflow.custom import Component\nfrom langflow.io import BoolInput, DataFrameInput, HandleInput, MessageTextInput, MultilineInput, Output\nfrom langflow.schema import DataFrame\n\nif TYPE_CHECKING:\n    from langchain_core.runnables import Runnable\n\n\nclass BatchRunComponent(Component):\n    display_name = \"Batch Run\"\n    description = \"Runs an LLM over each row of a DataFrame's column. If no column is set, the entire row is passed.\"\n    icon = \"List\"\n    beta = True\n\n    inputs = [\n        HandleInput(\n            name=\"model\",\n            display_name=\"Language Model\",\n            info=\"Connect the 'Language Model' output from your LLM component here.\",\n            input_types=[\"LanguageModel\"],\n            required=True,\n        ),\n        MultilineInput(\n            name=\"system_message\",\n            display_name=\"Instructions\",\n            info=\"Multi-line system instruction for all rows in the DataFrame.\",\n            required=False,\n        ),\n        DataFrameInput(\n            name=\"df\",\n            display_name=\"DataFrame\",\n            info=\"The DataFrame whose column (specified by 'column_name') we'll treat as text messages.\",\n            required=True,\n        ),\n        MessageTextInput(\n            name=\"column_name\",\n            display_name=\"Column Name\",\n            info=(\n                \"The name of the DataFrame column to treat as text messages. 
\"\n                \"If empty, all columns will be formatted in TOML.\"\n            ),\n            required=False,\n            advanced=False,\n        ),\n        MessageTextInput(\n            name=\"output_column_name\",\n            display_name=\"Output Column Name\",\n            info=\"Name of the column where the model's response will be stored.\",\n            value=\"model_response\",\n            required=False,\n            advanced=True,\n        ),\n        BoolInput(\n            name=\"enable_metadata\",\n            display_name=\"Enable Metadata\",\n            info=\"If True, add metadata to the output DataFrame.\",\n            value=False,\n            required=False,\n            advanced=True,\n        ),\n    ]\n\n    outputs = [\n        Output(\n            display_name=\"DataFrame\",\n            name=\"batch_results\",\n            method=\"run_batch\",\n            info=\"A DataFrame with all original columns plus the model's response column.\",\n        ),\n    ]\n\n    def _format_row_as_toml(self, row: dict[str, Any]) -> str:\n        \"\"\"Convert a dictionary (row) into a TOML-formatted string.\"\"\"\n        formatted_dict = {str(col): {\"value\": str(val)} for col, val in row.items()}\n        return toml.dumps(formatted_dict)\n\n    def _create_base_row(\n        self, original_row: dict[str, Any], model_response: str = \"\", batch_index: int = -1\n    ) -> dict[str, Any]:\n        \"\"\"Create a base row with original columns and additional metadata.\"\"\"\n        row = original_row.copy()\n        row[self.output_column_name] = model_response\n        row[\"batch_index\"] = batch_index\n        return row\n\n    def _add_metadata(\n        self, row: dict[str, Any], *, success: bool = True, system_msg: str = \"\", error: str | None = None\n    ) -> None:\n        \"\"\"Add metadata to a row if enabled.\"\"\"\n        if not self.enable_metadata:\n            return\n\n        if success:\n            row[\"metadata\"] = 
{\n                \"has_system_message\": bool(system_msg),\n                \"input_length\": len(row.get(\"text_input\", \"\")),\n                \"response_length\": len(row[self.output_column_name]),\n                \"processing_status\": \"success\",\n            }\n        else:\n            row[\"metadata\"] = {\n                \"error\": error,\n                \"processing_status\": \"failed\",\n            }\n\n    async def run_batch(self) -> DataFrame:\n        \"\"\"Process each row in df[column_name] with the language model asynchronously.\n\n        Returns:\n            DataFrame: A new DataFrame containing:\n                - All original columns\n                - The model's response column (customizable name)\n                - 'batch_index' column for processing order\n                - 'metadata' (optional)\n\n        Raises:\n            ValueError: If the specified column is not found in the DataFrame\n            TypeError: If the model is not compatible or input types are wrong\n        \"\"\"\n        model: Runnable = self.model\n        system_msg = self.system_message or \"\"\n        df: DataFrame = self.df\n        col_name = self.column_name or \"\"\n\n        # Validate inputs first\n        if not isinstance(df, DataFrame):\n            msg = f\"Expected DataFrame input, got {type(df)}\"\n            raise TypeError(msg)\n\n        if col_name and col_name not in df.columns:\n            msg = f\"Column '{col_name}' not found in the DataFrame. 
Available columns: {', '.join(df.columns)}\"\n            raise ValueError(msg)\n\n        try:\n            # Determine text input for each row\n            if col_name:\n                user_texts = df[col_name].astype(str).tolist()\n            else:\n                user_texts = [\n                    self._format_row_as_toml(cast(dict[str, Any], row)) for row in df.to_dict(orient=\"records\")\n                ]\n\n            total_rows = len(user_texts)\n            logger.info(f\"Processing {total_rows} rows with batch run\")\n\n            # Prepare the batch of conversations\n            conversations = [\n                [{\"role\": \"system\", \"content\": system_msg}, {\"role\": \"user\", \"content\": text}]\n                if system_msg\n                else [{\"role\": \"user\", \"content\": text}]\n                for text in user_texts\n            ]\n\n            # Configure the model with project info and callbacks\n            model = model.with_config(\n                {\n                    \"run_name\": self.display_name,\n                    \"project_name\": self.get_project_name(),\n                    \"callbacks\": self.get_langchain_callbacks(),\n                }\n            )\n            # Process batches and track progress\n            responses_with_idx = list(\n                zip(\n                    range(len(conversations)),\n                    await model.abatch(list(conversations)),\n                    strict=True,\n                )\n            )\n\n            # Sort by index to maintain order\n            responses_with_idx.sort(key=lambda x: x[0])\n\n            # Build the final data with enhanced metadata\n            rows: list[dict[str, Any]] = []\n            for idx, (original_row, response) in enumerate(\n                zip(df.to_dict(orient=\"records\"), responses_with_idx, strict=False)\n            ):\n                response_text = response[1].content if hasattr(response[1], \"content\") else 
str(response[1])\n                row = self._create_base_row(\n                    cast(dict[str, Any], original_row), model_response=response_text, batch_index=idx\n                )\n                self._add_metadata(row, success=True, system_msg=system_msg)\n                rows.append(row)\n\n                # Log progress\n                if (idx + 1) % max(1, total_rows // 10) == 0:\n                    logger.info(f\"Processed {idx + 1}/{total_rows} rows\")\n\n            logger.info(\"Batch processing completed successfully\")\n            return DataFrame(rows)\n\n        except (KeyError, AttributeError) as e:\n            # Handle data structure and attribute access errors\n            logger.error(f\"Data processing error: {e!s}\")\n            error_row = self._create_base_row({col: \"\" for col in df.columns}, model_response=\"\", batch_index=-1)\n            self._add_metadata(error_row, success=False, error=str(e))\n            return DataFrame([error_row])\n"
+                "value": "from __future__ import annotations\n\nfrom typing import TYPE_CHECKING, Any, cast\n\nimport toml  # type: ignore[import-untyped]\nfrom loguru import logger\n\nfrom langflow.custom import Component\nfrom langflow.io import BoolInput, DataFrameInput, HandleInput, MessageTextInput, MultilineInput, Output\nfrom langflow.schema import DataFrame\n\nif TYPE_CHECKING:\n    from langchain_core.runnables import Runnable\n\n\nclass BatchRunComponent(Component):\n    display_name = \"Batch Run\"\n    description = \"Runs an LLM on each row of a DataFrame column. If no column is specified, all columns are used.\"\n    icon = \"List\"\n\n    inputs = [\n        HandleInput(\n            name=\"model\",\n            display_name=\"Language Model\",\n            info=\"Connect the 'Language Model' output from your LLM component here.\",\n            input_types=[\"LanguageModel\"],\n            required=True,\n        ),\n        MultilineInput(\n            name=\"system_message\",\n            display_name=\"Instructions\",\n            info=\"Multi-line system instruction for all rows in the DataFrame.\",\n            required=False,\n        ),\n        DataFrameInput(\n            name=\"df\",\n            display_name=\"DataFrame\",\n            info=\"The DataFrame whose column (specified by 'column_name') we'll treat as text messages.\",\n            required=True,\n        ),\n        MessageTextInput(\n            name=\"column_name\",\n            display_name=\"Column Name\",\n            info=(\n                \"The name of the DataFrame column to treat as text messages. 
\"\n                \"If empty, all columns will be formatted in TOML.\"\n            ),\n            required=False,\n            advanced=False,\n        ),\n        MessageTextInput(\n            name=\"output_column_name\",\n            display_name=\"Output Column Name\",\n            info=\"Name of the column where the model's response will be stored.\",\n            value=\"model_response\",\n            required=False,\n            advanced=True,\n        ),\n        BoolInput(\n            name=\"enable_metadata\",\n            display_name=\"Enable Metadata\",\n            info=\"If True, add metadata to the output DataFrame.\",\n            value=False,\n            required=False,\n            advanced=True,\n        ),\n    ]\n\n    outputs = [\n        Output(\n            display_name=\"LLM Results\",\n            name=\"batch_results\",\n            method=\"run_batch\",\n            info=\"A DataFrame with all original columns plus the model's response column.\",\n        ),\n    ]\n\n    def _format_row_as_toml(self, row: dict[str, Any]) -> str:\n        \"\"\"Convert a dictionary (row) into a TOML-formatted string.\"\"\"\n        formatted_dict = {str(col): {\"value\": str(val)} for col, val in row.items()}\n        return toml.dumps(formatted_dict)\n\n    def _create_base_row(\n        self, original_row: dict[str, Any], model_response: str = \"\", batch_index: int = -1\n    ) -> dict[str, Any]:\n        \"\"\"Create a base row with original columns and additional metadata.\"\"\"\n        row = original_row.copy()\n        row[self.output_column_name] = model_response\n        row[\"batch_index\"] = batch_index\n        return row\n\n    def _add_metadata(\n        self, row: dict[str, Any], *, success: bool = True, system_msg: str = \"\", error: str | None = None\n    ) -> None:\n        \"\"\"Add metadata to a row if enabled.\"\"\"\n        if not self.enable_metadata:\n            return\n\n        if success:\n            row[\"metadata\"] = 
{\n                \"has_system_message\": bool(system_msg),\n                \"input_length\": len(row.get(\"text_input\", \"\")),\n                \"response_length\": len(row[self.output_column_name]),\n                \"processing_status\": \"success\",\n            }\n        else:\n            row[\"metadata\"] = {\n                \"error\": error,\n                \"processing_status\": \"failed\",\n            }\n\n    async def run_batch(self) -> DataFrame:\n        \"\"\"Process each row in df[column_name] with the language model asynchronously.\n\n        Returns:\n            DataFrame: A new DataFrame containing:\n                - All original columns\n                - The model's response column (customizable name)\n                - 'batch_index' column for processing order\n                - 'metadata' (optional)\n\n        Raises:\n            ValueError: If the specified column is not found in the DataFrame\n            TypeError: If the model is not compatible or input types are wrong\n        \"\"\"\n        model: Runnable = self.model\n        system_msg = self.system_message or \"\"\n        df: DataFrame = self.df\n        col_name = self.column_name or \"\"\n\n        # Validate inputs first\n        if not isinstance(df, DataFrame):\n            msg = f\"Expected DataFrame input, got {type(df)}\"\n            raise TypeError(msg)\n\n        if col_name and col_name not in df.columns:\n            msg = f\"Column '{col_name}' not found in the DataFrame. 
Available columns: {', '.join(df.columns)}\"\n            raise ValueError(msg)\n\n        try:\n            # Determine text input for each row\n            if col_name:\n                user_texts = df[col_name].astype(str).tolist()\n            else:\n                user_texts = [\n                    self._format_row_as_toml(cast(dict[str, Any], row)) for row in df.to_dict(orient=\"records\")\n                ]\n\n            total_rows = len(user_texts)\n            logger.info(f\"Processing {total_rows} rows with batch run\")\n\n            # Prepare the batch of conversations\n            conversations = [\n                [{\"role\": \"system\", \"content\": system_msg}, {\"role\": \"user\", \"content\": text}]\n                if system_msg\n                else [{\"role\": \"user\", \"content\": text}]\n                for text in user_texts\n            ]\n\n            # Configure the model with project info and callbacks\n            model = model.with_config(\n                {\n                    \"run_name\": self.display_name,\n                    \"project_name\": self.get_project_name(),\n                    \"callbacks\": self.get_langchain_callbacks(),\n                }\n            )\n            # Process batches and track progress\n            responses_with_idx = list(\n                zip(\n                    range(len(conversations)),\n                    await model.abatch(list(conversations)),\n                    strict=True,\n                )\n            )\n\n            # Sort by index to maintain order\n            responses_with_idx.sort(key=lambda x: x[0])\n\n            # Build the final data with enhanced metadata\n            rows: list[dict[str, Any]] = []\n            for idx, (original_row, response) in enumerate(\n                zip(df.to_dict(orient=\"records\"), responses_with_idx, strict=False)\n            ):\n                response_text = response[1].content if hasattr(response[1], \"content\") else 
str(response[1])\n                row = self._create_base_row(\n                    cast(dict[str, Any], original_row), model_response=response_text, batch_index=idx\n                )\n                self._add_metadata(row, success=True, system_msg=system_msg)\n                rows.append(row)\n\n                # Log progress\n                if (idx + 1) % max(1, total_rows // 10) == 0:\n                    logger.info(f\"Processed {idx + 1}/{total_rows} rows\")\n\n            logger.info(\"Batch processing completed successfully\")\n            return DataFrame(rows)\n\n        except (KeyError, AttributeError) as e:\n            # Handle data structure and attribute access errors\n            logger.error(f\"Data processing error: {e!s}\")\n            error_row = self._create_base_row({col: \"\" for col in df.columns}, model_response=\"\", batch_index=-1)\n            self._add_metadata(error_row, success=False, error=str(e))\n            return DataFrame([error_row])\n"
               },
               "column_name": {
                 "_input_type": "StrInput",