feat: add Spider Web Scraper & Crawler (#2439)

* spider files

* rebuild required

* add spider-client here

* Feat: Spider Web Crawler & Scraper

* Feat: spider integration

* new input not working

* [autofix.ci] apply automated fixes

* [autofix.ci] apply automated fixes (attempt 2/3)

* fix: add outputs and configure build method

* style: run ruff

* Refactor SpiderTool to use 'crawl' instead of 'build' for generating Markdown content

* chore: add type ignore

* chore: new lock

* chore: Update mem0ai dependency to version 0.0.5

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
This commit is contained in:
William Espegren 2024-08-08 14:53:05 +02:00 committed by GitHub
commit 7a36cc9ebf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 2580 additions and 68 deletions

View file

@ -0,0 +1 @@
# Modes of operation offered by the Spider component ("scrape" or "crawl").
MODES = ["scrape", "crawl"]

View file

@ -1,6 +1,7 @@
from typing import Any
from langflow.custom import Component
from langflow.inputs.inputs import DictInput, SecretStrInput, MessageTextInput, DropdownInput
from langflow.inputs.inputs import DictInput, DropdownInput, MessageTextInput, SecretStrInput
from langflow.template.field.base import Output
@ -60,13 +61,19 @@ class AstraVectorizeComponent(Component):
name="model_name",
display_name="Model Name",
info=f"The embedding model to use for the selected provider. Each provider has a different set of models "
f"available (https://docs.datastax.com/en/astra-db-serverless/databases/embedding-generation.html):\n\n{VECTORIZE_MODELS_STR}",
f"available (full list at https://docs.datastax.com/en/astra-db-serverless/databases/embedding-generation.html):\n\n{VECTORIZE_MODELS_STR}",
required=True,
),
MessageTextInput(
name="api_key_name",
display_name="Provider API Key Name",
info="The name of the embeddings provider API key stored on Astra.",
display_name="API Key name",
info="The name of the embeddings provider API key stored on Astra. If set, it will override the 'ProviderKey' in the authentication parameters.",
),
DictInput(
name="authentication",
display_name="Authentication parameters",
is_list=True,
advanced=True,
),
SecretStrInput(
name="provider_api_key",

View file

@ -0,0 +1,121 @@
from spider.spider import Spider # type: ignore
from langflow.base.langchain_utilities.spider_constants import MODES
from langflow.custom import Component
from langflow.io import BoolInput, DictInput, DropdownInput, IntInput, Output, SecretStrInput, StrInput
from langflow.schema import Data
class SpiderTool(Component):
    """Component that scrapes a single URL or crawls a site through the Spider API.

    The result is exposed through the "Markdown" output as a list of ``Data``
    records, each holding the ``content`` and ``url`` of one returned page.
    """

    display_name: str = "Spider Web Crawler & Scraper"
    description: str = "Spider API for web crawling and scraping."
    output_types: list[str] = ["Document"]
    documentation: str = "https://spider.cloud/docs/api"

    inputs = [
        SecretStrInput(
            name="spider_api_key",
            display_name="Spider API Key",
            required=True,
            password=True,
            info="The Spider API Key, get it from https://spider.cloud",
        ),
        StrInput(
            name="url",
            display_name="URL",
            required=True,
            info="The URL to scrape or crawl",
        ),
        DropdownInput(
            name="mode",
            display_name="Mode",
            required=True,
            options=MODES,
            value=MODES[0],
            info="The mode of operation: scrape or crawl",
        ),
        IntInput(
            name="limit",
            display_name="Limit",
            info="The maximum amount of pages allowed to crawl per website. Set to 0 to crawl all pages.",
            advanced=True,
        ),
        IntInput(
            name="depth",
            display_name="Depth",
            info="The crawl limit for maximum depth. If 0, no limit will be applied.",
            advanced=True,
        ),
        StrInput(
            name="blacklist",
            display_name="Blacklist",
            info="Blacklist paths that you do not want to crawl. Use Regex patterns.",
            advanced=True,
        ),
        StrInput(
            name="whitelist",
            display_name="Whitelist",
            info="Whitelist paths that you want to crawl, ignoring all other routes. Use Regex patterns.",
            advanced=True,
        ),
        BoolInput(
            name="use_readability",
            display_name="Use Readability",
            info="Use readability to pre-process the content for reading.",
            advanced=True,
        ),
        IntInput(
            name="request_timeout",
            display_name="Request Timeout",
            info="Timeout for the request in seconds.",
            advanced=True,
        ),
        BoolInput(
            name="metadata",
            display_name="Metadata",
            info="Include metadata in the response.",
            advanced=True,
        ),
        DictInput(
            name="params",
            display_name="Additional Parameters",
            info="Additional parameters to pass to the API. If provided, other inputs will be ignored.",
        ),
    ]

    outputs = [
        Output(display_name="Markdown", name="content", method="crawl"),
    ]

    def crawl(self) -> list[Data]:
        """Run the configured scrape or crawl and return one ``Data`` per result.

        When ``params`` is set, its ``data`` mapping is sent as the request
        parameters and the individual inputs are ignored; otherwise the
        parameters are assembled from the individual inputs with
        ``return_format`` fixed to ``"markdown"``. In ``"scrape"`` mode the
        ``limit`` parameter is always forced to 1.

        Returns:
            A list of ``Data`` objects carrying the ``content`` and ``url``
            of each record returned by the Spider client.

        Raises:
            Exception: If the mode is invalid or the Spider client call
                fails; the original error is attached as the cause.
        """
        if self.params:
            # Copy so that forcing ``limit`` in scrape mode below does not
            # write back into the component's own input value.
            parameters = dict(self.params.data)
        else:
            parameters = {
                "limit": self.limit,
                "depth": self.depth,
                "blacklist": self.blacklist,
                "whitelist": self.whitelist,
                "use_readability": self.use_readability,
                "request_timeout": self.request_timeout,
                "metadata": self.metadata,
                "return_format": "markdown",
            }

        app = Spider(api_key=self.spider_api_key)
        try:
            if self.mode == "scrape":
                parameters["limit"] = 1
                result = app.scrape_url(self.url, parameters)
            elif self.mode == "crawl":
                result = app.crawl_url(self.url, parameters)
            else:
                raise ValueError(f"Invalid mode: {self.mode}. Must be 'scrape' or 'crawl'.")
        except Exception as e:
            # Keep the original message format, but chain the cause explicitly.
            raise Exception(f"Error: {e}") from e

        return [Data(data={"content": record["content"], "url": record["url"]}) for record in result]

View file

@ -157,7 +157,7 @@ def create_app():
raise ValueError(f"Invalid port number {prome_port_str}")
if settings.prometheus_enabled:
from prometheus_client import start_http_server
from prometheus_client import start_http_server # type: ignore
start_http_server(settings.prometheus_port)

File diff suppressed because it is too large Load diff

View file

@ -77,6 +77,10 @@ setuptools = ">=70"
nanoid = "^2.0.0"
filelock = "^3.15.4"
grandalf = "^0.8.0"
crewai = "^0.36.0"
spider-client = "^0.0.27"
[tool.poetry.extras]
deploy = ["celery", "redis", "flower"]
local = ["llama-cpp-python", "sentence-transformers", "ctransformers"]