feat: Add format option with raw HTML to URL component (#3762)
Add format option with raw HTML to URL component Co-authored-by: Nadir J <31660040+NadirJ@users.noreply.github.com>
This commit is contained in:
parent
667713f6c0
commit
6f0b6830d2
2 changed files with 34 additions and 4 deletions
|
|
@ -1,10 +1,10 @@
|
|||
import re
|
||||
|
||||
from langchain_community.document_loaders.web_base import WebBaseLoader
|
||||
from langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader
|
||||
|
||||
from langflow.helpers.data import data_to_text
|
||||
from langflow.custom import Component
|
||||
from langflow.io import MessageTextInput, Output
|
||||
from langflow.io import DropdownInput, MessageTextInput, Output
|
||||
from langflow.schema import Data
|
||||
from langflow.schema.message import Message
|
||||
|
||||
|
|
@ -22,6 +22,13 @@ class URLComponent(Component):
|
|||
info="Enter one or more URLs, by clicking the '+' button.",
|
||||
is_list=True,
|
||||
),
|
||||
DropdownInput(
|
||||
name="format",
|
||||
display_name="Output format",
|
||||
info="Output format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.",
|
||||
options=["Text", "Raw HTML"],
|
||||
value="Text",
|
||||
),
|
||||
]
|
||||
|
||||
outputs = [
|
||||
|
|
@ -64,7 +71,10 @@ class URLComponent(Component):
|
|||
|
||||
def fetch_content(self) -> list[Data]:
|
||||
urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]
|
||||
loader = WebBaseLoader(web_paths=urls, encoding="utf-8")
|
||||
if self.format == "Raw HTML":
|
||||
loader = AsyncHtmlLoader(web_path=urls, encoding="utf-8")
|
||||
else:
|
||||
loader = WebBaseLoader(web_paths=urls, encoding="utf-8")
|
||||
docs = loader.load()
|
||||
data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]
|
||||
self.status = data
|
||||
|
|
|
|||
|
|
@ -200,7 +200,27 @@
|
|||
"show": true,
|
||||
"title_case": false,
|
||||
"type": "code",
|
||||
"value": "import re\n\nfrom langchain_community.document_loaders.web_base import WebBaseLoader\n\nfrom langflow.helpers.data import data_to_text\nfrom langflow.custom import Component\nfrom langflow.io import MessageTextInput, Output\nfrom langflow.schema import Data\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n display_name = \"URL\"\n description = \"Fetch content from one or more URLs.\"\n icon = \"layout-template\"\n name = \"URL\"\n\n inputs = [\n MessageTextInput(\n name=\"urls\",\n display_name=\"URLs\",\n info=\"Enter one or more URLs, by clicking the '+' button.\",\n is_list=True,\n ),\n ]\n\n outputs = [\n Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n Output(display_name=\"Text\", name=\"text\", method=\"fetch_content_text\"),\n ]\n\n def ensure_url(self, string: str) -> str:\n \"\"\"\n Ensures the given string is a URL by adding 'http://' if it doesn't start with 'http://' or 'https://'.\n Raises an error if the string is not a valid URL.\n\n Parameters:\n string (str): The string to be checked and possibly modified.\n\n Returns:\n str: The modified string that is ensured to be a URL.\n\n Raises:\n ValueError: If the string is not a valid URL.\n \"\"\"\n if not string.startswith((\"http://\", \"https://\")):\n string = \"http://\" + string\n\n # Basic URL validation regex\n url_regex = re.compile(\n r\"^(https?:\\/\\/)?\" # optional protocol\n r\"(www\\.)?\" # optional www\n r\"([a-zA-Z0-9.-]+)\" # domain\n r\"(\\.[a-zA-Z]{2,})?\" # top-level domain\n r\"(:\\d+)?\" # optional port\n r\"(\\/[^\\s]*)?$\", # optional path\n re.IGNORECASE,\n )\n\n if not url_regex.match(string):\n raise ValueError(f\"Invalid URL: {string}\")\n\n return string\n\n def fetch_content(self) -> list[Data]:\n urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]\n loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n docs = loader.load()\n data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n self.status = data\n return data\n\n def fetch_content_text(self) -> Message:\n data = self.fetch_content()\n\n result_string = data_to_text(\"{text}\", data)\n self.status = result_string\n return Message(text=result_string)\n"
|
||||
"value": "import re\n\nfrom langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader\n\nfrom langflow.helpers.data import data_to_text\nfrom langflow.custom import Component\nfrom langflow.io import DropdownInput, MessageTextInput, Output\nfrom langflow.schema import Data\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n display_name = \"URL\"\n description = \"Fetch content from one or more URLs.\"\n icon = \"layout-template\"\n name = \"URL\"\n\n inputs = [\n MessageTextInput(\n name=\"urls\",\n display_name=\"URLs\",\n info=\"Enter one or more URLs, by clicking the '+' button.\",\n is_list=True,\n ),\n DropdownInput(\n name=\"format\",\n display_name=\"Output format\",\n info=\"Output format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.\",\n options=[\"Text\", \"Raw HTML\"],\n value=\"Text\",\n ),\n ]\n\n outputs = [\n Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n Output(display_name=\"Text\", name=\"text\", method=\"fetch_content_text\"),\n ]\n\n def ensure_url(self, string: str) -> str:\n \"\"\"\n Ensures the given string is a URL by adding 'http://' if it doesn't start with 'http://' or 'https://'.\n Raises an error if the string is not a valid URL.\n\n Parameters:\n string (str): The string to be checked and possibly modified.\n\n Returns:\n str: The modified string that is ensured to be a URL.\n\n Raises:\n ValueError: If the string is not a valid URL.\n \"\"\"\n if not string.startswith((\"http://\", \"https://\")):\n string = \"http://\" + string\n\n # Basic URL validation regex\n url_regex = re.compile(\n r\"^(https?:\\/\\/)?\" # optional protocol\n r\"(www\\.)?\" # optional www\n r\"([a-zA-Z0-9.-]+)\" # domain\n r\"(\\.[a-zA-Z]{2,})?\" # top-level domain\n r\"(:\\d+)?\" # optional port\n r\"(\\/[^\\s]*)?$\", # optional path\n re.IGNORECASE,\n )\n\n if not url_regex.match(string):\n raise ValueError(f\"Invalid URL: {string}\")\n\n return string\n\n def fetch_content(self) -> list[Data]:\n urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]\n if self.format == \"Raw HTML\":\n loader = AsyncHtmlLoader(web_path=urls, encoding=\"utf-8\")\n else:\n loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n docs = loader.load()\n data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n self.status = data\n return data\n\n def fetch_content_text(self) -> Message:\n data = self.fetch_content()\n\n result_string = data_to_text(\"{text}\", data)\n self.status = result_string\n return Message(text=result_string)\n"
|
||||
},
|
||||
"format": {
|
||||
"_input_type": "DropdownInput",
|
||||
"advanced": false,
|
||||
"combobox": false,
|
||||
"display_name": "Output format",
|
||||
"dynamic": false,
|
||||
"info": "Output format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.",
|
||||
"name": "format",
|
||||
"options": [
|
||||
"Text",
|
||||
"Raw HTML"
|
||||
],
|
||||
"placeholder": "",
|
||||
"required": false,
|
||||
"show": true,
|
||||
"title_case": false,
|
||||
"trace_as_metadata": true,
|
||||
"type": "str",
|
||||
"value": "Text"
|
||||
},
|
||||
"urls": {
|
||||
"advanced": false,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue