From da24c94786bc6ca8c47b15620012b280bfbebc48 Mon Sep 17 00:00:00 2001 From: Simon Duncan Date: Mon, 17 Mar 2025 14:33:35 -0500 Subject: [PATCH] fix: typo, update description for url.py (#6010) * Fix typo, update description for url.py - Change "retrive" to "retrieve" - Change description to remove redundancy * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes * fix outdated components --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Gabriel Luiz Freitas Almeida Co-authored-by: cristhianzl Co-authored-by: Mike Fortman --- .../starter_projects/Custom Component Maker.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/base/langflow/initial_setup/starter_projects/Custom Component Maker.json b/src/backend/base/langflow/initial_setup/starter_projects/Custom Component Maker.json index 81d1031eb..6b36dcd1e 100644 --- a/src/backend/base/langflow/initial_setup/starter_projects/Custom Component Maker.json +++ b/src/backend/base/langflow/initial_setup/starter_projects/Custom Component Maker.json @@ -1490,7 +1490,7 @@ "show": true, "title_case": false, "type": "code", - "value": "import asyncio\nimport json\nimport re\n\nimport aiohttp\nfrom langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader\n\nfrom langflow.custom import Component\nfrom langflow.io import BoolInput, DropdownInput, MessageTextInput, Output, StrInput\nfrom langflow.schema import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n display_name = \"URL\"\n description = (\n \"Load and retrieve data from specified URLs. Supports output in plain text, raw HTML, \"\n \"or JSON, with options for cleaning and separating multiple outputs.\"\n )\n icon = \"layout-template\"\n name = \"URL\"\n\n inputs = [\n MessageTextInput(\n name=\"urls\",\n display_name=\"URLs\",\n is_list=True,\n tool_mode=True,\n placeholder=\"Enter a URL...\",\n list_add_label=\"Add URL\",\n ),\n DropdownInput(\n name=\"format\",\n display_name=\"Output Format\",\n info=(\n \"Output Format. Use 'Text' to extract text from the HTML, 'Raw HTML' for the raw HTML \"\n \"content, or 'JSON' to extract JSON from the HTML.\"\n ),\n options=[\"Text\", \"Raw HTML\", \"JSON\"],\n value=\"Text\",\n real_time_refresh=True,\n ),\n StrInput(\n name=\"separator\",\n display_name=\"Separator\",\n value=\"\\n\\n\",\n show=True,\n info=(\n \"Specify the separator to use between multiple outputs. Default for Text is '\\\\n\\\\n'. \"\n \"Default for Raw HTML is '\\\\n\\\\n'.\"\n ),\n ),\n BoolInput(\n name=\"clean_extra_whitespace\",\n display_name=\"Clean Extra Whitespace\",\n value=True,\n show=True,\n info=\"Whether to clean excessive blank lines in the text output. Only applies to 'Text' format.\",\n ),\n ]\n\n outputs = [\n Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n Output(display_name=\"Text\", name=\"text\", method=\"fetch_content_text\"),\n Output(display_name=\"DataFrame\", name=\"dataframe\", method=\"as_dataframe\"),\n ]\n\n async def validate_json_content(self, url: str) -> bool:\n \"\"\"Validates if the URL content is actually JSON.\"\"\"\n try:\n async with aiohttp.ClientSession() as session, session.get(url) as response:\n http_ok = 200\n if response.status != http_ok:\n return False\n\n content = await response.text()\n try:\n json.loads(content)\n except json.JSONDecodeError:\n return False\n else:\n return True\n except (aiohttp.ClientError, asyncio.TimeoutError):\n # Log specific error for debugging if needed\n return False\n\n def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update fields based on selected format.\"\"\"\n if field_name == \"format\":\n is_text_mode = field_value == \"Text\"\n is_json_mode = field_value == \"JSON\"\n build_config[\"separator\"][\"value\"] = \"\\n\\n\" if is_text_mode else \"\\n\\n\"\n build_config[\"clean_extra_whitespace\"][\"show\"] = is_text_mode\n build_config[\"separator\"][\"show\"] = not is_json_mode\n return build_config\n\n def ensure_url(self, string: str) -> str:\n \"\"\"Ensures the given string is a valid URL.\"\"\"\n if not string.startswith((\"http://\", \"https://\")):\n string = \"http://\" + string\n\n url_regex = re.compile(\n r\"^(https?:\\/\\/)?\"\n r\"(www\\.)?\"\n r\"([a-zA-Z0-9.-]+)\"\n r\"(\\.[a-zA-Z]{2,})?\"\n r\"(:\\d+)?\"\n r\"(\\/[^\\s]*)?$\",\n re.IGNORECASE,\n )\n\n error_msg = \"Invalid URL - \" + string\n if not url_regex.match(string):\n raise ValueError(error_msg)\n\n return string\n\n def fetch_content(self) -> list[Data]:\n \"\"\"Fetch content based on selected format.\"\"\"\n urls = list({self.ensure_url(url.strip()) for url in self.urls if url.strip()})\n\n no_urls_msg = \"No valid URLs provided.\"\n if not urls:\n raise ValueError(no_urls_msg)\n\n # If JSON format is selected, validate JSON content first\n if self.format == \"JSON\":\n for url in urls:\n is_json = asyncio.run(self.validate_json_content(url))\n if not is_json:\n error_msg = \"Invalid JSON content from URL - \" + url\n raise ValueError(error_msg)\n\n if self.format == \"Raw HTML\":\n loader = AsyncHtmlLoader(web_path=urls, encoding=\"utf-8\")\n else:\n loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n\n docs = loader.load()\n\n if self.format == \"JSON\":\n data = []\n for doc in docs:\n try:\n json_content = json.loads(doc.page_content)\n data_dict = {\"text\": json.dumps(json_content, indent=2), **json_content, **doc.metadata}\n data.append(Data(**data_dict))\n except json.JSONDecodeError as err:\n source = doc.metadata.get(\"source\", \"unknown URL\")\n error_msg = \"Invalid JSON content from \" + source\n raise ValueError(error_msg) from err\n return data\n\n return [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n\n def fetch_content_text(self) -> Message:\n \"\"\"Fetch content and return as formatted text.\"\"\"\n data = self.fetch_content()\n\n if self.format == \"JSON\":\n text_list = [item.text for item in data]\n result = \"\\n\".join(text_list)\n else:\n text_list = [item.text for item in data]\n if self.format == \"Text\" and self.clean_extra_whitespace:\n text_list = [re.sub(r\"\\n{3,}\", \"\\n\\n\", text) for text in text_list]\n result = self.separator.join(text_list)\n\n self.status = result\n return Message(text=result)\n\n def as_dataframe(self) -> DataFrame:\n \"\"\"Return fetched content as a DataFrame.\"\"\"\n return DataFrame(self.fetch_content())\n" + "value": "import re\n\nfrom langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader\n\nfrom langflow.custom import Component\nfrom langflow.helpers.data import data_to_text\nfrom langflow.io import DropdownInput, MessageTextInput, Output\nfrom langflow.schema import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n display_name = \"URL\"\n description = \"Load and retrive data from specified URLs.\"\n icon = \"layout-template\"\n name = \"URL\"\n\n inputs = [\n MessageTextInput(\n name=\"urls\",\n display_name=\"URLs\",\n is_list=True,\n tool_mode=True,\n placeholder=\"Enter a URL...\",\n list_add_label=\"Add URL\",\n ),\n DropdownInput(\n name=\"format\",\n display_name=\"Output Format\",\n info=\"Output Format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.\",\n options=[\"Text\", \"Raw HTML\"],\n value=\"Text\",\n ),\n ]\n\n outputs = [\n Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n Output(display_name=\"Message\", name=\"text\", method=\"fetch_content_text\"),\n Output(display_name=\"DataFrame\", name=\"dataframe\", method=\"as_dataframe\"),\n ]\n\n def ensure_url(self, string: str) -> str:\n \"\"\"Ensures the given string is a URL by adding 'http://' if it doesn't start with 'http://' or 'https://'.\n\n Raises an error if the string is not a valid URL.\n\n Parameters:\n string (str): The string to be checked and possibly modified.\n\n Returns:\n str: The modified string that is ensured to be a URL.\n\n Raises:\n ValueError: If the string is not a valid URL.\n \"\"\"\n if not string.startswith((\"http://\", \"https://\")):\n string = \"http://\" + string\n\n # Basic URL validation regex\n url_regex = re.compile(\n r\"^(https?:\\/\\/)?\" # optional protocol\n r\"(www\\.)?\" # optional www\n r\"([a-zA-Z0-9.-]+)\" # domain\n r\"(\\.[a-zA-Z]{2,})?\" # top-level domain\n r\"(:\\d+)?\" # optional port\n r\"(\\/[^\\s]*)?$\", # optional path\n re.IGNORECASE,\n )\n\n if not url_regex.match(string):\n msg = f\"Invalid URL: {string}\"\n raise ValueError(msg)\n\n return string\n\n def fetch_content(self) -> list[Data]:\n urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]\n if self.format == \"Raw HTML\":\n loader = AsyncHtmlLoader(web_path=urls, encoding=\"utf-8\")\n else:\n loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n docs = loader.load()\n data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n self.status = data\n return data\n\n def fetch_content_text(self) -> Message:\n data = self.fetch_content()\n\n result_string = data_to_text(\"{text}\", data)\n self.status = result_string\n return Message(text=result_string)\n\n def as_dataframe(self) -> DataFrame:\n return DataFrame(self.fetch_content())\n" }, "format": { "_input_type": "DropdownInput", @@ -1684,7 +1684,7 @@ "show": true, "title_case": false, "type": "code", - "value": "import asyncio\nimport json\nimport re\n\nimport aiohttp\nfrom langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader\n\nfrom langflow.custom import Component\nfrom langflow.io import BoolInput, DropdownInput, MessageTextInput, Output, StrInput\nfrom langflow.schema import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n display_name = \"URL\"\n description = (\n \"Load and retrieve data from specified URLs. Supports output in plain text, raw HTML, \"\n \"or JSON, with options for cleaning and separating multiple outputs.\"\n )\n icon = \"layout-template\"\n name = \"URL\"\n\n inputs = [\n MessageTextInput(\n name=\"urls\",\n display_name=\"URLs\",\n is_list=True,\n tool_mode=True,\n placeholder=\"Enter a URL...\",\n list_add_label=\"Add URL\",\n ),\n DropdownInput(\n name=\"format\",\n display_name=\"Output Format\",\n info=(\n \"Output Format. Use 'Text' to extract text from the HTML, 'Raw HTML' for the raw HTML \"\n \"content, or 'JSON' to extract JSON from the HTML.\"\n ),\n options=[\"Text\", \"Raw HTML\", \"JSON\"],\n value=\"Text\",\n real_time_refresh=True,\n ),\n StrInput(\n name=\"separator\",\n display_name=\"Separator\",\n value=\"\\n\\n\",\n show=True,\n info=(\n \"Specify the separator to use between multiple outputs. Default for Text is '\\\\n\\\\n'. \"\n \"Default for Raw HTML is '\\\\n\\\\n'.\"\n ),\n ),\n BoolInput(\n name=\"clean_extra_whitespace\",\n display_name=\"Clean Extra Whitespace\",\n value=True,\n show=True,\n info=\"Whether to clean excessive blank lines in the text output. Only applies to 'Text' format.\",\n ),\n ]\n\n outputs = [\n Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n Output(display_name=\"Text\", name=\"text\", method=\"fetch_content_text\"),\n Output(display_name=\"DataFrame\", name=\"dataframe\", method=\"as_dataframe\"),\n ]\n\n async def validate_json_content(self, url: str) -> bool:\n \"\"\"Validates if the URL content is actually JSON.\"\"\"\n try:\n async with aiohttp.ClientSession() as session, session.get(url) as response:\n http_ok = 200\n if response.status != http_ok:\n return False\n\n content = await response.text()\n try:\n json.loads(content)\n except json.JSONDecodeError:\n return False\n else:\n return True\n except (aiohttp.ClientError, asyncio.TimeoutError):\n # Log specific error for debugging if needed\n return False\n\n def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update fields based on selected format.\"\"\"\n if field_name == \"format\":\n is_text_mode = field_value == \"Text\"\n is_json_mode = field_value == \"JSON\"\n build_config[\"separator\"][\"value\"] = \"\\n\\n\" if is_text_mode else \"\\n\\n\"\n build_config[\"clean_extra_whitespace\"][\"show\"] = is_text_mode\n build_config[\"separator\"][\"show\"] = not is_json_mode\n return build_config\n\n def ensure_url(self, string: str) -> str:\n \"\"\"Ensures the given string is a valid URL.\"\"\"\n if not string.startswith((\"http://\", \"https://\")):\n string = \"http://\" + string\n\n url_regex = re.compile(\n r\"^(https?:\\/\\/)?\"\n r\"(www\\.)?\"\n r\"([a-zA-Z0-9.-]+)\"\n r\"(\\.[a-zA-Z]{2,})?\"\n r\"(:\\d+)?\"\n r\"(\\/[^\\s]*)?$\",\n re.IGNORECASE,\n )\n\n error_msg = \"Invalid URL - \" + string\n if not url_regex.match(string):\n raise ValueError(error_msg)\n\n return string\n\n def fetch_content(self) -> list[Data]:\n \"\"\"Fetch content based on selected format.\"\"\"\n urls = list({self.ensure_url(url.strip()) for url in self.urls if url.strip()})\n\n no_urls_msg = \"No valid URLs provided.\"\n if not urls:\n raise ValueError(no_urls_msg)\n\n # If JSON format is selected, validate JSON content first\n if self.format == \"JSON\":\n for url in urls:\n is_json = asyncio.run(self.validate_json_content(url))\n if not is_json:\n error_msg = \"Invalid JSON content from URL - \" + url\n raise ValueError(error_msg)\n\n if self.format == \"Raw HTML\":\n loader = AsyncHtmlLoader(web_path=urls, encoding=\"utf-8\")\n else:\n loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n\n docs = loader.load()\n\n if self.format == \"JSON\":\n data = []\n for doc in docs:\n try:\n json_content = json.loads(doc.page_content)\n data_dict = {\"text\": json.dumps(json_content, indent=2), **json_content, **doc.metadata}\n data.append(Data(**data_dict))\n except json.JSONDecodeError as err:\n source = doc.metadata.get(\"source\", \"unknown URL\")\n error_msg = \"Invalid JSON content from \" + source\n raise ValueError(error_msg) from err\n return data\n\n return [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n\n def fetch_content_text(self) -> Message:\n \"\"\"Fetch content and return as formatted text.\"\"\"\n data = self.fetch_content()\n\n if self.format == \"JSON\":\n text_list = [item.text for item in data]\n result = \"\\n\".join(text_list)\n else:\n text_list = [item.text for item in data]\n if self.format == \"Text\" and self.clean_extra_whitespace:\n text_list = [re.sub(r\"\\n{3,}\", \"\\n\\n\", text) for text in text_list]\n result = self.separator.join(text_list)\n\n self.status = result\n return Message(text=result)\n\n def as_dataframe(self) -> DataFrame:\n \"\"\"Return fetched content as a DataFrame.\"\"\"\n return DataFrame(self.fetch_content())\n" + "value": "import re\n\nfrom langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader\n\nfrom langflow.custom import Component\nfrom langflow.helpers.data import data_to_text\nfrom langflow.io import DropdownInput, MessageTextInput, Output\nfrom langflow.schema import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n display_name = \"URL\"\n description = \"Load and retrive data from specified URLs.\"\n icon = \"layout-template\"\n name = \"URL\"\n\n inputs = [\n MessageTextInput(\n name=\"urls\",\n display_name=\"URLs\",\n is_list=True,\n tool_mode=True,\n placeholder=\"Enter a URL...\",\n list_add_label=\"Add URL\",\n ),\n DropdownInput(\n name=\"format\",\n display_name=\"Output Format\",\n info=\"Output Format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.\",\n options=[\"Text\", \"Raw HTML\"],\n value=\"Text\",\n ),\n ]\n\n outputs = [\n Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n Output(display_name=\"Message\", name=\"text\", method=\"fetch_content_text\"),\n Output(display_name=\"DataFrame\", name=\"dataframe\", method=\"as_dataframe\"),\n ]\n\n def ensure_url(self, string: str) -> str:\n \"\"\"Ensures the given string is a URL by adding 'http://' if it doesn't start with 'http://' or 'https://'.\n\n Raises an error if the string is not a valid URL.\n\n Parameters:\n string (str): The string to be checked and possibly modified.\n\n Returns:\n str: The modified string that is ensured to be a URL.\n\n Raises:\n ValueError: If the string is not a valid URL.\n \"\"\"\n if not string.startswith((\"http://\", \"https://\")):\n string = \"http://\" + string\n\n # Basic URL validation regex\n url_regex = re.compile(\n r\"^(https?:\\/\\/)?\" # optional protocol\n r\"(www\\.)?\" # optional www\n r\"([a-zA-Z0-9.-]+)\" # domain\n r\"(\\.[a-zA-Z]{2,})?\" # top-level domain\n r\"(:\\d+)?\" # optional port\n r\"(\\/[^\\s]*)?$\", # optional path\n re.IGNORECASE,\n )\n\n if not url_regex.match(string):\n msg = f\"Invalid URL: {string}\"\n raise ValueError(msg)\n\n return string\n\n def fetch_content(self) -> list[Data]:\n urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]\n if self.format == \"Raw HTML\":\n loader = AsyncHtmlLoader(web_path=urls, encoding=\"utf-8\")\n else:\n loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n docs = loader.load()\n data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n self.status = data\n return data\n\n def fetch_content_text(self) -> Message:\n data = self.fetch_content()\n\n result_string = data_to_text(\"{text}\", data)\n self.status = result_string\n return Message(text=result_string)\n\n def as_dataframe(self) -> DataFrame:\n return DataFrame(self.fetch_content())\n" }, "format": { "_input_type": "DropdownInput", @@ -1884,7 +1884,7 @@ "show": true, "title_case": false, "type": "code", - "value": "import asyncio\nimport json\nimport re\n\nimport aiohttp\nfrom langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader\n\nfrom langflow.custom import Component\nfrom langflow.io import BoolInput, DropdownInput, MessageTextInput, Output, StrInput\nfrom langflow.schema import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n display_name = \"URL\"\n description = (\n \"Load and retrieve data from specified URLs. Supports output in plain text, raw HTML, \"\n \"or JSON, with options for cleaning and separating multiple outputs.\"\n )\n icon = \"layout-template\"\n name = \"URL\"\n\n inputs = [\n MessageTextInput(\n name=\"urls\",\n display_name=\"URLs\",\n is_list=True,\n tool_mode=True,\n placeholder=\"Enter a URL...\",\n list_add_label=\"Add URL\",\n ),\n DropdownInput(\n name=\"format\",\n display_name=\"Output Format\",\n info=(\n \"Output Format. Use 'Text' to extract text from the HTML, 'Raw HTML' for the raw HTML \"\n \"content, or 'JSON' to extract JSON from the HTML.\"\n ),\n options=[\"Text\", \"Raw HTML\", \"JSON\"],\n value=\"Text\",\n real_time_refresh=True,\n ),\n StrInput(\n name=\"separator\",\n display_name=\"Separator\",\n value=\"\\n\\n\",\n show=True,\n info=(\n \"Specify the separator to use between multiple outputs. Default for Text is '\\\\n\\\\n'. \"\n \"Default for Raw HTML is '\\\\n\\\\n'.\"\n ),\n ),\n BoolInput(\n name=\"clean_extra_whitespace\",\n display_name=\"Clean Extra Whitespace\",\n value=True,\n show=True,\n info=\"Whether to clean excessive blank lines in the text output. Only applies to 'Text' format.\",\n ),\n ]\n\n outputs = [\n Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n Output(display_name=\"Text\", name=\"text\", method=\"fetch_content_text\"),\n Output(display_name=\"DataFrame\", name=\"dataframe\", method=\"as_dataframe\"),\n ]\n\n async def validate_json_content(self, url: str) -> bool:\n \"\"\"Validates if the URL content is actually JSON.\"\"\"\n try:\n async with aiohttp.ClientSession() as session, session.get(url) as response:\n http_ok = 200\n if response.status != http_ok:\n return False\n\n content = await response.text()\n try:\n json.loads(content)\n except json.JSONDecodeError:\n return False\n else:\n return True\n except (aiohttp.ClientError, asyncio.TimeoutError):\n # Log specific error for debugging if needed\n return False\n\n def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Dynamically update fields based on selected format.\"\"\"\n if field_name == \"format\":\n is_text_mode = field_value == \"Text\"\n is_json_mode = field_value == \"JSON\"\n build_config[\"separator\"][\"value\"] = \"\\n\\n\" if is_text_mode else \"\\n\\n\"\n build_config[\"clean_extra_whitespace\"][\"show\"] = is_text_mode\n build_config[\"separator\"][\"show\"] = not is_json_mode\n return build_config\n\n def ensure_url(self, string: str) -> str:\n \"\"\"Ensures the given string is a valid URL.\"\"\"\n if not string.startswith((\"http://\", \"https://\")):\n string = \"http://\" + string\n\n url_regex = re.compile(\n r\"^(https?:\\/\\/)?\"\n r\"(www\\.)?\"\n r\"([a-zA-Z0-9.-]+)\"\n r\"(\\.[a-zA-Z]{2,})?\"\n r\"(:\\d+)?\"\n r\"(\\/[^\\s]*)?$\",\n re.IGNORECASE,\n )\n\n error_msg = \"Invalid URL - \" + string\n if not url_regex.match(string):\n raise ValueError(error_msg)\n\n return string\n\n def fetch_content(self) -> list[Data]:\n \"\"\"Fetch content based on selected format.\"\"\"\n urls = list({self.ensure_url(url.strip()) for url in self.urls if url.strip()})\n\n no_urls_msg = \"No valid URLs provided.\"\n if not urls:\n raise ValueError(no_urls_msg)\n\n # If JSON format is selected, validate JSON content first\n if self.format == \"JSON\":\n for url in urls:\n is_json = asyncio.run(self.validate_json_content(url))\n if not is_json:\n error_msg = \"Invalid JSON content from URL - \" + url\n raise ValueError(error_msg)\n\n if self.format == \"Raw HTML\":\n loader = AsyncHtmlLoader(web_path=urls, encoding=\"utf-8\")\n else:\n loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n\n docs = loader.load()\n\n if self.format == \"JSON\":\n data = []\n for doc in docs:\n try:\n json_content = json.loads(doc.page_content)\n data_dict = {\"text\": json.dumps(json_content, indent=2), **json_content, **doc.metadata}\n data.append(Data(**data_dict))\n except json.JSONDecodeError as err:\n source = doc.metadata.get(\"source\", \"unknown URL\")\n error_msg = \"Invalid JSON content from \" + source\n raise ValueError(error_msg) from err\n return data\n\n return [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n\n def fetch_content_text(self) -> Message:\n \"\"\"Fetch content and return as formatted text.\"\"\"\n data = self.fetch_content()\n\n if self.format == \"JSON\":\n text_list = [item.text for item in data]\n result = \"\\n\".join(text_list)\n else:\n text_list = [item.text for item in data]\n if self.format == \"Text\" and self.clean_extra_whitespace:\n text_list = [re.sub(r\"\\n{3,}\", \"\\n\\n\", text) for text in text_list]\n result = self.separator.join(text_list)\n\n self.status = result\n return Message(text=result)\n\n def as_dataframe(self) -> DataFrame:\n \"\"\"Return fetched content as a DataFrame.\"\"\"\n return DataFrame(self.fetch_content())\n" + "value": "import re\n\nfrom langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader\n\nfrom langflow.custom import Component\nfrom langflow.helpers.data import data_to_text\nfrom langflow.io import DropdownInput, MessageTextInput, Output\nfrom langflow.schema import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n display_name = \"URL\"\n description = \"Load and retrive data from specified URLs.\"\n icon = \"layout-template\"\n name = \"URL\"\n\n inputs = [\n MessageTextInput(\n name=\"urls\",\n display_name=\"URLs\",\n is_list=True,\n tool_mode=True,\n placeholder=\"Enter a URL...\",\n list_add_label=\"Add URL\",\n ),\n DropdownInput(\n name=\"format\",\n display_name=\"Output Format\",\n info=\"Output Format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.\",\n options=[\"Text\", \"Raw HTML\"],\n value=\"Text\",\n ),\n ]\n\n outputs = [\n Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n Output(display_name=\"Message\", name=\"text\", method=\"fetch_content_text\"),\n Output(display_name=\"DataFrame\", name=\"dataframe\", method=\"as_dataframe\"),\n ]\n\n def ensure_url(self, string: str) -> str:\n \"\"\"Ensures the given string is a URL by adding 'http://' if it doesn't start with 'http://' or 'https://'.\n\n Raises an error if the string is not a valid URL.\n\n Parameters:\n string (str): The string to be checked and possibly modified.\n\n Returns:\n str: The modified string that is ensured to be a URL.\n\n Raises:\n ValueError: If the string is not a valid URL.\n \"\"\"\n if not string.startswith((\"http://\", \"https://\")):\n string = \"http://\" + string\n\n # Basic URL validation regex\n url_regex = re.compile(\n r\"^(https?:\\/\\/)?\" # optional protocol\n r\"(www\\.)?\" # optional www\n r\"([a-zA-Z0-9.-]+)\" # domain\n r\"(\\.[a-zA-Z]{2,})?\" # top-level domain\n r\"(:\\d+)?\" # optional port\n r\"(\\/[^\\s]*)?$\", # optional path\n re.IGNORECASE,\n )\n\n if not url_regex.match(string):\n msg = f\"Invalid URL: {string}\"\n raise ValueError(msg)\n\n return string\n\n def fetch_content(self) -> list[Data]:\n urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]\n if self.format == \"Raw HTML\":\n loader = AsyncHtmlLoader(web_path=urls, encoding=\"utf-8\")\n else:\n loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n docs = loader.load()\n data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n self.status = data\n return data\n\n def fetch_content_text(self) -> Message:\n data = self.fetch_content()\n\n result_string = data_to_text(\"{text}\", data)\n self.status = result_string\n return Message(text=result_string)\n\n def as_dataframe(self) -> DataFrame:\n return DataFrame(self.fetch_content())\n" }, "format": { "_input_type": "DropdownInput",