From d4c18d3c00af186377ec73bd7a6b8ade335f2c07 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 25 Jun 2024 17:10:29 -0300
Subject: [PATCH] added firecrawl integration
---
poetry.lock | 26 ++++--
pyproject.toml | 1 +
.../langchain_utilities/FirecrawlCrawlApi.py | 81 +++++++++++++++++++
.../langchain_utilities/FirecrawlScrapeApi.py | 72 +++++++++++++++++
src/backend/base/poetry.lock | 16 +++-
src/backend/base/pyproject.toml | 1 +
.../src/icons/Firecrawl/FirecrawlLogo.jsx | 61 ++++++++++++++
.../src/icons/Firecrawl/firecraw-logo.svg | 28 +++++++
src/frontend/src/icons/Firecrawl/index.tsx | 9 +++
src/frontend/src/utils/styleUtils.ts | 3 +
10 files changed, 291 insertions(+), 7 deletions(-)
create mode 100644 src/backend/base/langflow/components/langchain_utilities/FirecrawlCrawlApi.py
create mode 100644 src/backend/base/langflow/components/langchain_utilities/FirecrawlScrapeApi.py
create mode 100644 src/frontend/src/icons/Firecrawl/FirecrawlLogo.jsx
create mode 100644 src/frontend/src/icons/Firecrawl/firecraw-logo.svg
create mode 100644 src/frontend/src/icons/Firecrawl/index.tsx
diff --git a/poetry.lock b/poetry.lock
index 5ac1608a9..a4f0db362 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2112,6 +2112,20 @@ files = [
{file = "filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb"},
]
+[[package]]
+name = "firecrawl-py"
+version = "0.0.16"
+description = "Python SDK for Firecrawl API"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "firecrawl_py-0.0.16-py3-none-any.whl", hash = "sha256:9024f483b501852a6b9c4e6cdfc9e8dde452d922afac357080bb278a0c9c2a26"},
+ {file = "firecrawl_py-0.0.16.tar.gz", hash = "sha256:6c662fa0a549bc7f5c0acb704baba6731869ca0451094034264dfc1b4eb086e4"},
+]
+
+[package.dependencies]
+requests = "*"
+
[[package]]
name = "flaml"
version = "2.1.2"
@@ -2432,8 +2446,8 @@ files = [
[package.dependencies]
cffi = {version = ">=1.12.2", markers = "platform_python_implementation == \"CPython\" and sys_platform == \"win32\""}
greenlet = [
- {version = ">=2.0.0", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.11\""},
{version = ">=3.0rc3", markers = "platform_python_implementation == \"CPython\" and python_version >= \"3.11\""},
+ {version = ">=2.0.0", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.11\""},
]
"zope.event" = "*"
"zope.interface" = "*"
@@ -2592,12 +2606,12 @@ files = [
google-auth = ">=2.14.1,<3.0.dev0"
googleapis-common-protos = ">=1.56.2,<2.0.dev0"
grpcio = [
- {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
{version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
+ {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
]
grpcio-status = [
- {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
{version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
+ {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
]
proto-plus = ">=1.22.3,<2.0.0dev"
protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"
@@ -4611,8 +4625,8 @@ psutil = ">=5.9.1"
pywin32 = {version = "*", markers = "platform_system == \"Windows\""}
pyzmq = ">=25.0.0"
requests = [
- {version = ">=2.26.0", markers = "python_version <= \"3.11\""},
{version = ">=2.32.2", markers = "python_version > \"3.11\""},
+ {version = ">=2.26.0", markers = "python_version <= \"3.11\""},
]
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
Werkzeug = ">=2.0.0"
@@ -6076,9 +6090,9 @@ files = [
[package.dependencies]
numpy = [
+ {version = ">=1.26.0,<2", markers = "python_version >= \"3.12\""},
{version = ">=1.22.4,<2", markers = "python_version < \"3.11\""},
{version = ">=1.23.2,<2", markers = "python_version == \"3.11\""},
- {version = ">=1.26.0,<2", markers = "python_version >= \"3.12\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@@ -10548,4 +10562,4 @@ local = ["ctransformers", "llama-cpp-python", "sentence-transformers"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.10,<3.13"
-content-hash = "f7377e3a997651cbcec2b9227b0bcde2507afc7d6236b708f4dc62857f150578"
+content-hash = "3e72b6faa1c674615a7e5dec3e7d962349e736bf6675c08a49080b7f336cc75b"
diff --git a/pyproject.toml b/pyproject.toml
index 98d00485b..0ea058281 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -92,6 +92,7 @@ unstructured = {extras = ["docx", "md", "pptx"], version = "^0.14.4"}
langchain-aws = "^0.1.6"
langchain-mongodb = "^0.1.6"
kubernetes = "^30.1.0"
+firecrawl-py = "^0.0.16"
[tool.poetry.group.dev.dependencies]
diff --git a/src/backend/base/langflow/components/langchain_utilities/FirecrawlCrawlApi.py b/src/backend/base/langflow/components/langchain_utilities/FirecrawlCrawlApi.py
new file mode 100644
index 000000000..54355285d
--- /dev/null
+++ b/src/backend/base/langflow/components/langchain_utilities/FirecrawlCrawlApi.py
@@ -0,0 +1,103 @@
+from typing import Optional
+from firecrawl.firecrawl import FirecrawlApp
+from langflow.custom import CustomComponent
+from langflow.schema import Data
+import uuid
+
+class FirecrawlCrawlApi(CustomComponent):
+    """Langflow component that runs a crawl through the Firecrawl crawl API.
+
+    ``build`` passes the start URL and the optional crawler/page options to
+    ``FirecrawlApp.crawl_url`` and wraps the returned value in a ``Data``
+    record under the ``"results"`` key.
+    """
+
+    display_name: str = "FirecrawlCrawlApi"
+    description: str = "Firecrawl Crawl API."
+    output_types: list[str] = ["Document"]
+    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/crawl"
+    field_config = {
+        "api_key": {
+            "display_name": "API Key",
+            "field_type": "str",
+            "required": True,
+            "password": True,
+            "info": "The API key to use Firecrawl API.",
+        },
+        "url": {
+            "display_name": "URL",
+            "field_type": "str",
+            "required": True,
+            "info": "The base URL to start crawling from.",
+        },
+        "timeout": {
+            "display_name": "Timeout",
+            "field_type": "int",
+            "info": "The timeout in milliseconds.",
+        },
+        "crawlerOptions": {
+            "display_name": "Crawler Options",
+            "info": "Options for the crawler behavior.",
+        },
+        "pageOptions": {
+            "display_name": "Page Options",
+            "info": "The page options to send with the request.",
+        },
+        "idempotency_key": {
+            "display_name": "Idempotency Key",
+            "field_type": "str",
+            "info": "Optional idempotency key to ensure unique requests.",
+        },
+    }
+
+    def build(
+        self,
+        api_key: str,
+        url: str,
+        timeout: Optional[int] = 30000,
+        crawlerOptions: Optional[Data] = None,
+        pageOptions: Optional[Data] = None,
+        idempotency_key: Optional[str] = None,
+    ) -> Data:
+        """Run the crawl and return its result.
+
+        Args:
+            api_key: Firecrawl API key.
+            url: Base URL to start crawling from.
+            timeout: Timeout in milliseconds; ``None`` falls back to 30000.
+            crawlerOptions: Optional ``Data`` whose ``data["text"]`` entry is
+                sent as the ``crawlerOptions`` payload.
+            pageOptions: Optional ``Data`` whose ``data["text"]`` entry is
+                sent as the ``pageOptions`` payload.
+            idempotency_key: Optional idempotency key; a random UUID4 string
+                is generated when none is given.
+
+        Returns:
+            A ``Data`` record holding the crawl result under ``"results"``.
+        """
+        # NOTE(review): the "text" entry is forwarded as-is, so it is presumably
+        # already a dict of options - confirm against the node feeding this input.
+        crawler_options_dict = crawlerOptions.data["text"] if crawlerOptions else {}
+        page_options_dict = pageOptions.data["text"] if pageOptions else {}
+
+        if not idempotency_key:
+            idempotency_key = str(uuid.uuid4())
+
+        # The signature allows None, which would make the division below raise
+        # TypeError; fall back to the declared default instead.
+        if timeout is None:
+            timeout = 30000
+
+        app = FirecrawlApp(api_key=api_key)
+        crawl_result = app.crawl_url(
+            url,
+            {
+                "crawlerOptions": crawler_options_dict,
+                "pageOptions": page_options_dict,
+            },
+            True,
+            int(timeout / 1000),  # milliseconds -> whole seconds
+            idempotency_key,
+        )
+
+        return Data(data={"results": crawl_result})
diff --git a/src/backend/base/langflow/components/langchain_utilities/FirecrawlScrapeApi.py b/src/backend/base/langflow/components/langchain_utilities/FirecrawlScrapeApi.py
new file mode 100644
index 000000000..5cdfc82ba
--- /dev/null
+++ b/src/backend/base/langflow/components/langchain_utilities/FirecrawlScrapeApi.py
@@ -0,0 +1,93 @@
+from typing import Optional
+from firecrawl.firecrawl import FirecrawlApp
+from langflow.custom import CustomComponent
+from langflow.schema import Data
+from langflow.services.database.models.base import orjson_dumps
+import json
+
+class FirecrawlScrapeApi(CustomComponent):
+    """Langflow component that scrapes one URL through the Firecrawl scrape API.
+
+    ``build`` passes the URL, the timeout and the optional page/extractor
+    options to ``FirecrawlApp.scrape_url`` and wraps the returned value in a
+    ``Data`` record.
+    """
+
+    display_name: str = "FirecrawlScrapeApi"
+    description: str = "Firecrawl Scrape API."
+    output_types: list[str] = ["Document"]
+    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/scrape"
+    field_config = {
+        "api_key": {
+            "display_name": "API Key",
+            "field_type": "str",
+            "required": True,
+            "password": True,
+            "info": "The API key to use Firecrawl API.",
+        },
+        "url": {
+            "display_name": "URL",
+            "field_type": "str",
+            "required": True,
+            "info": "The URL to scrape.",
+        },
+        "timeout": {
+            "display_name": "Timeout",
+            "info": "Timeout in milliseconds for the request.",
+            "field_type": "int",
+            "default_value": 10000,
+        },
+        "pageOptions": {
+            "display_name": "Page Options",
+            "info": "The page options to send with the request.",
+        },
+        "extractorOptions": {
+            "display_name": "Extractor Options",
+            "info": "The extractor options to send with the request.",
+        },
+    }
+
+    def build(
+        self,
+        api_key: str,
+        url: str,
+        timeout: Optional[int] = 10000,
+        pageOptions: Optional[Data] = None,
+        extractorOptions: Optional[Data] = None,
+    ) -> Data:
+        """Scrape ``url`` and return the result.
+
+        Args:
+            api_key: Firecrawl API key.
+            url: URL to scrape.
+            timeout: Request timeout in milliseconds; ``None`` falls back to 10000.
+            pageOptions: Optional ``Data`` whose ``data["text"]`` entry is
+                sent as the ``pageOptions`` payload.
+            extractorOptions: Optional ``Data`` whose ``data["text"]`` entry is
+                sent as the ``extractorOptions`` payload.
+
+        Returns:
+            A ``Data`` record built from the value ``scrape_url`` returns.
+        """
+        # NOTE(review): the "text" entry is forwarded as-is, so it is presumably
+        # already a dict of options - confirm against the node feeding this input.
+        extractor_options_dict = extractorOptions.data["text"] if extractorOptions else {}
+        page_options_dict = pageOptions.data["text"] if pageOptions else {}
+
+        # The signature allows None; str(None) would put the literal "None" in
+        # the request, so fall back to the declared default instead.
+        if timeout is None:
+            timeout = 10000
+
+        app = FirecrawlApp(api_key=api_key)
+        results = app.scrape_url(
+            url,
+            {
+                # The scrape endpoint documents timeout as an integer (ms).
+                "timeout": int(timeout),
+                "extractorOptions": extractor_options_dict,
+                "pageOptions": page_options_dict,
+            },
+        )
+
+        return Data(data=results)
diff --git a/src/backend/base/poetry.lock b/src/backend/base/poetry.lock
index 379821ebf..256a5eef9 100644
--- a/src/backend/base/poetry.lock
+++ b/src/backend/base/poetry.lock
@@ -739,6 +739,20 @@ typer = ">=0.12.3"
[package.extras]
standard = ["fastapi", "uvicorn[standard] (>=0.15.0)"]
+[[package]]
+name = "firecrawl-py"
+version = "0.0.16"
+description = "Python SDK for Firecrawl API"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "firecrawl_py-0.0.16-py3-none-any.whl", hash = "sha256:9024f483b501852a6b9c4e6cdfc9e8dde452d922afac357080bb278a0c9c2a26"},
+ {file = "firecrawl_py-0.0.16.tar.gz", hash = "sha256:6c662fa0a549bc7f5c0acb704baba6731869ca0451094034264dfc1b4eb086e4"},
+]
+
+[package.dependencies]
+requests = "*"
+
[[package]]
name = "frozenlist"
version = "1.4.1"
@@ -3232,4 +3246,4 @@ local = []
[metadata]
lock-version = "2.0"
python-versions = ">=3.10,<3.13"
-content-hash = "4f566531a8539ddc81cb91a7e7f9b723c84679f0af5bb8619f7b02f9ffc6cfaa"
+content-hash = "7e46144d27c633214f00e73e496c0e4d56db1fb47032a21861677ec275b79d86"
diff --git a/src/backend/base/pyproject.toml b/src/backend/base/pyproject.toml
index ee8072a05..c1eab2bb9 100644
--- a/src/backend/base/pyproject.toml
+++ b/src/backend/base/pyproject.toml
@@ -64,6 +64,7 @@ pyperclip = "^1.8.2"
uncurl = "^0.0.11"
sentry-sdk = {extras = ["fastapi", "loguru"], version = "^2.5.1"}
chardet = "^5.2.0"
+firecrawl-py = "^0.0.16"
[tool.poetry.extras]
diff --git a/src/frontend/src/icons/Firecrawl/FirecrawlLogo.jsx b/src/frontend/src/icons/Firecrawl/FirecrawlLogo.jsx
new file mode 100644
index 000000000..f79173d13
--- /dev/null
+++ b/src/frontend/src/icons/Firecrawl/FirecrawlLogo.jsx
@@ -0,0 +1,61 @@
+const SvgFirecrawlLogo = (props) => (
+
+);
+export default SvgFirecrawlLogo;
diff --git a/src/frontend/src/icons/Firecrawl/firecraw-logo.svg b/src/frontend/src/icons/Firecrawl/firecraw-logo.svg
new file mode 100644
index 000000000..ea8d2c7f1
--- /dev/null
+++ b/src/frontend/src/icons/Firecrawl/firecraw-logo.svg
@@ -0,0 +1,28 @@
+
\ No newline at end of file
diff --git a/src/frontend/src/icons/Firecrawl/index.tsx b/src/frontend/src/icons/Firecrawl/index.tsx
new file mode 100644
index 000000000..060d053bb
--- /dev/null
+++ b/src/frontend/src/icons/Firecrawl/index.tsx
@@ -0,0 +1,13 @@
+import React, { forwardRef } from "react";
+import SvgFirecrawlLogo from "./FirecrawlLogo";
+
+/**
+ * Firecrawl brand icon: renders the imported `SvgFirecrawlLogo`, passing the
+ * received props and ref through to it.
+ */
+export const FirecrawlIcon = forwardRef<
+  SVGSVGElement,
+  React.PropsWithChildren<{}>
+>((props, ref) => {
+  return <SvgFirecrawlLogo ref={ref} {...props} />;
+});
diff --git a/src/frontend/src/utils/styleUtils.ts b/src/frontend/src/utils/styleUtils.ts
index 30a22fd60..b344eaaec 100644
--- a/src/frontend/src/utils/styleUtils.ts
+++ b/src/frontend/src/utils/styleUtils.ts
@@ -171,6 +171,7 @@ import { CouchbaseIcon } from "../icons/Couchbase";
import { ElasticsearchIcon } from "../icons/ElasticsearchStore";
import { EvernoteIcon } from "../icons/Evernote";
import { FBIcon } from "../icons/FacebookMessenger";
+import { FirecrawlIcon } from "../icons/Firecrawl";
import { GitBookIcon } from "../icons/GitBook";
import { GoogleIcon } from "../icons/Google";
import { GoogleGenerativeAIIcon } from "../icons/GoogleGenerativeAI";
@@ -363,6 +364,8 @@ export const nodeIconsLucide: iconsType = {
CohereEmbeddings: CohereIcon,
EverNoteLoader: EvernoteIcon,
FacebookChatLoader: FBIcon,
+ FirecrawlCrawlApi: FirecrawlIcon,
+ FirecrawlScrapeApi: FirecrawlIcon,
GitbookLoader: GitBookIcon,
GoogleSearchAPIWrapper: GoogleIcon,
GoogleSearchResults: GoogleIcon,