feat: add Spider Web Scraper & Crawler (#2439)
* spider files * rebuild required * add spider-client here * Feat: Spider Web Crawler & Scraper * Feat: spider integration * new input not working * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes (attempt 2/3) * fix: add outputs and configure build method * style: run ruff * Refactor SpiderTool to use 'crawl' instead of 'build' for generating Markdown content * chore: add type ignore * chore: new lock * chore: Update mem0ai dependency to version 0.0.5 --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
This commit is contained in:
parent
a2c98b90c1
commit
7a36cc9ebf
12 changed files with 2580 additions and 68 deletions
|
|
@ -0,0 +1 @@
|
|||
# Operating modes offered by the Spider component's "Mode" dropdown.
# Order matters: the component uses MODES[0] as the dropdown's default value.
MODES = ["scrape", "crawl"]
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
from typing import Any
|
||||
|
||||
from langflow.custom import Component
|
||||
from langflow.inputs.inputs import DictInput, SecretStrInput, MessageTextInput, DropdownInput
|
||||
from langflow.inputs.inputs import DictInput, DropdownInput, MessageTextInput, SecretStrInput
|
||||
from langflow.template.field.base import Output
|
||||
|
||||
|
||||
|
|
@ -60,13 +61,19 @@ class AstraVectorizeComponent(Component):
|
|||
name="model_name",
|
||||
display_name="Model Name",
|
||||
info=f"The embedding model to use for the selected provider. Each provider has a different set of models "
|
||||
f"available (https://docs.datastax.com/en/astra-db-serverless/databases/embedding-generation.html):\n\n{VECTORIZE_MODELS_STR}",
|
||||
f"available (full list at https://docs.datastax.com/en/astra-db-serverless/databases/embedding-generation.html):\n\n{VECTORIZE_MODELS_STR}",
|
||||
required=True,
|
||||
),
|
||||
MessageTextInput(
|
||||
name="api_key_name",
|
||||
display_name="Provider API Key Name",
|
||||
info="The name of the embeddings provider API key stored on Astra.",
|
||||
display_name="API Key name",
|
||||
info="The name of the embeddings provider API key stored on Astra. If set, it will override the 'ProviderKey' in the authentication parameters.",
|
||||
),
|
||||
DictInput(
|
||||
name="authentication",
|
||||
display_name="Authentication parameters",
|
||||
is_list=True,
|
||||
advanced=True,
|
||||
),
|
||||
SecretStrInput(
|
||||
name="provider_api_key",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,121 @@
|
|||
from spider.spider import Spider # type: ignore
|
||||
|
||||
from langflow.base.langchain_utilities.spider_constants import MODES
|
||||
from langflow.custom import Component
|
||||
from langflow.io import BoolInput, DictInput, DropdownInput, IntInput, Output, SecretStrInput, StrInput
|
||||
from langflow.schema import Data
|
||||
|
||||
|
||||
class SpiderTool(Component):
    """Langflow component that scrapes or crawls a URL through the Spider API.

    The single output, ``content``, is produced by :meth:`crawl` and holds one
    ``Data`` record per entry returned by the Spider client.
    """

    display_name: str = "Spider Web Crawler & Scraper"
    description: str = "Spider API for web crawling and scraping."
    output_types: list[str] = ["Document"]
    documentation: str = "https://spider.cloud/docs/api"

    inputs = [
        SecretStrInput(
            name="spider_api_key",
            display_name="Spider API Key",
            required=True,
            password=True,
            info="The Spider API Key, get it from https://spider.cloud",
        ),
        StrInput(
            name="url",
            display_name="URL",
            required=True,
            info="The URL to scrape or crawl",
        ),
        DropdownInput(
            name="mode",
            display_name="Mode",
            required=True,
            options=MODES,
            value=MODES[0],
            info="The mode of operation: scrape or crawl",
        ),
        IntInput(
            name="limit",
            display_name="Limit",
            info="The maximum amount of pages allowed to crawl per website. Set to 0 to crawl all pages.",
            advanced=True,
        ),
        IntInput(
            name="depth",
            display_name="Depth",
            info="The crawl limit for maximum depth. If 0, no limit will be applied.",
            advanced=True,
        ),
        StrInput(
            name="blacklist",
            display_name="Blacklist",
            info="Blacklist paths that you do not want to crawl. Use Regex patterns.",
            advanced=True,
        ),
        StrInput(
            name="whitelist",
            display_name="Whitelist",
            info="Whitelist paths that you want to crawl, ignoring all other routes. Use Regex patterns.",
            advanced=True,
        ),
        BoolInput(
            name="use_readability",
            display_name="Use Readability",
            info="Use readability to pre-process the content for reading.",
            advanced=True,
        ),
        IntInput(
            name="request_timeout",
            display_name="Request Timeout",
            info="Timeout for the request in seconds.",
            advanced=True,
        ),
        BoolInput(
            name="metadata",
            display_name="Metadata",
            info="Include metadata in the response.",
            advanced=True,
        ),
        DictInput(
            name="params",
            display_name="Additional Parameters",
            info="Additional parameters to pass to the API. If provided, other inputs will be ignored.",
        ),
    ]

    outputs = [
        Output(display_name="Markdown", name="content", method="crawl"),
    ]

    def crawl(self) -> list[Data]:
        """Run the configured Spider request and wrap each result in a ``Data``.

        When ``params`` is set, its ``data`` is sent to the API as-is and the
        individual inputs (limit, depth, ...) are ignored; otherwise the
        request parameters are assembled from those inputs with
        ``return_format`` fixed to ``"markdown"``.

        Returns:
            One ``Data`` per returned record, carrying its ``content`` and
            ``url`` fields.

        Raises:
            Exception: If the mode is neither ``"scrape"`` nor ``"crawl"`` or
                the Spider client call fails. The original error is attached
                as ``__cause__``.
        """
        if self.params:
            parameters = self.params.data
        else:
            parameters = {
                "limit": self.limit,
                "depth": self.depth,
                "blacklist": self.blacklist,
                "whitelist": self.whitelist,
                "use_readability": self.use_readability,
                "request_timeout": self.request_timeout,
                "metadata": self.metadata,
                "return_format": "markdown",
            }

        app = Spider(api_key=self.spider_api_key)
        try:
            if self.mode == "scrape":
                # Scraping targets a single page, so force limit=1. Build a new
                # mapping rather than assigning into `parameters`, which may be
                # the user-supplied params object and must not be mutated.
                result = app.scrape_url(self.url, {**parameters, "limit": 1})
            elif self.mode == "crawl":
                result = app.crawl_url(self.url, parameters)
            else:
                raise ValueError(f"Invalid mode: {self.mode}. Must be 'scrape' or 'crawl'.")
        except Exception as e:
            # Keep the historical exception type and message, but chain the
            # cause so the underlying traceback is not lost.
            raise Exception(f"Error: {e!s}") from e

        return [Data(data={"content": record["content"], "url": record["url"]}) for record in result]
|
||||
|
|
@ -157,7 +157,7 @@ def create_app():
|
|||
raise ValueError(f"Invalid port number {prome_port_str}")
|
||||
|
||||
if settings.prometheus_enabled:
|
||||
from prometheus_client import start_http_server
|
||||
from prometheus_client import start_http_server # type: ignore
|
||||
|
||||
start_http_server(settings.prometheus_port)
|
||||
|
||||
|
|
|
|||
2331
src/backend/base/poetry.lock
generated
2331
src/backend/base/poetry.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -77,6 +77,10 @@ setuptools = ">=70"
|
|||
nanoid = "^2.0.0"
|
||||
filelock = "^3.15.4"
|
||||
grandalf = "^0.8.0"
|
||||
crewai = "^0.36.0"
|
||||
spider-client = "^0.0.27"
|
||||
|
||||
|
||||
[tool.poetry.extras]
|
||||
deploy = ["celery", "redis", "flower"]
|
||||
local = ["llama-cpp-python", "sentence-transformers", "ctransformers"]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue