From a301ff3e58f25355b984f559d5e69bd9246b1884 Mon Sep 17 00:00:00 2001
From: Aparup Ganguly <89296617+aparupganguly@users.noreply.github.com>
Date: Wed, 5 Mar 2025 21:20:16 +0530
Subject: [PATCH] feat: add map and extract endpoints with v1 updates for scrape and crawl (#6787)

* feat: add new map extraction feature

* refactor: Improve Firecrawl component code quality and error handling

* refactor: Remove redundant output_types from Firecrawl API components

* [autofix.ci] apply automated fixes

---------

Co-authored-by: Aparup Ganguly
Co-authored-by: Gabriel Luiz Freitas Almeida
Co-authored-by: Edwin Jose
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
---
Reviewer revision notes (free-form text, not part of the commit message):
  * firecrawl_extract_api.py: extract() no longer reads self.ignore_sitemap,
    self.include_subdomains and self.show_sources. Their inputs are commented
    out in `inputs`, so the component never defines those attributes.
  * firecrawl_extract_api.py: an unusable schema is now reported with
    logger.warning instead of being skipped silently.
  * extract() and map() gained docstrings. Hunk sizes and the diffstat were
    updated accordingly; the `index` blob ids of the two new files were not
    recomputed.

 .../langflow/components/firecrawl/__init__.py |   4 +-
 .../firecrawl/firecrawl_crawl_api.py          |  19 ++-
 .../firecrawl/firecrawl_extract_api.py        | 155 ++++++++++++++++++
 .../components/firecrawl/firecrawl_map_api.py | 101 ++++++++++++
 .../firecrawl/firecrawl_scrape_api.py         |  17 ++-
 src/frontend/src/utils/styleUtils.ts          |   2 +
 6 files changed, 287 insertions(+), 11 deletions(-)
 create mode 100644 src/backend/base/langflow/components/firecrawl/firecrawl_extract_api.py
 create mode 100644 src/backend/base/langflow/components/firecrawl/firecrawl_map_api.py

diff --git a/src/backend/base/langflow/components/firecrawl/__init__.py b/src/backend/base/langflow/components/firecrawl/__init__.py
index 9c1a2cec2..1f718f020 100644
--- a/src/backend/base/langflow/components/firecrawl/__init__.py
+++ b/src/backend/base/langflow/components/firecrawl/__init__.py
@@ -1,4 +1,6 @@
 from .firecrawl_crawl_api import FirecrawlCrawlApi
+from .firecrawl_extract_api import FirecrawlExtractApi
+from .firecrawl_map_api import FirecrawlMapApi
 from .firecrawl_scrape_api import FirecrawlScrapeApi
 
-__all__ = ["FirecrawlCrawlApi", "FirecrawlScrapeApi"]
+__all__ = ["FirecrawlCrawlApi", "FirecrawlExtractApi", "FirecrawlMapApi", "FirecrawlScrapeApi"]
diff --git a/src/backend/base/langflow/components/firecrawl/firecrawl_crawl_api.py b/src/backend/base/langflow/components/firecrawl/firecrawl_crawl_api.py
index 0e85c8c2b..28e22d746 100644
--- a/src/backend/base/langflow/components/firecrawl/firecrawl_crawl_api.py
+++ b/src/backend/base/langflow/components/firecrawl/firecrawl_crawl_api.py
@@ -10,8 +10,7 @@ class FirecrawlCrawlApi(Component):
     description: str = "Firecrawl Crawl API."
     name = "FirecrawlCrawlApi"
 
-    output_types: list[str] = ["Document"]
-    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/crawl-post"
+    documentation: str = "https://docs.firecrawl.dev/v1/api-reference/endpoint/crawl-post"
 
     inputs = [
         SecretStrInput(
@@ -57,7 +56,7 @@ class FirecrawlCrawlApi(Component):
 
     def crawl(self) -> Data:
         try:
-            from firecrawl.firecrawl import FirecrawlApp
+            from firecrawl import FirecrawlApp
         except ImportError as e:
             msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
             raise ImportError(msg) from e
@@ -67,6 +66,20 @@ class FirecrawlCrawlApi(Component):
         if scrape_options_dict:
             params["scrapeOptions"] = scrape_options_dict
 
+        # Set default values for new parameters in v1
+        params.setdefault("maxDepth", 2)
+        params.setdefault("limit", 10000)
+        params.setdefault("allowExternalLinks", False)
+        params.setdefault("allowBackwardLinks", False)
+        params.setdefault("ignoreSitemap", False)
+        params.setdefault("ignoreQueryParameters", False)
+
+        # Ensure onlyMainContent is explicitly set if not provided
+        if "scrapeOptions" in params:
+            params["scrapeOptions"].setdefault("onlyMainContent", True)
+        else:
+            params["scrapeOptions"] = {"onlyMainContent": True}
+
         if not self.idempotency_key:
             self.idempotency_key = str(uuid.uuid4())
 
diff --git a/src/backend/base/langflow/components/firecrawl/firecrawl_extract_api.py b/src/backend/base/langflow/components/firecrawl/firecrawl_extract_api.py
new file mode 100644
index 000000000..e1aa3bb2e
--- /dev/null
+++ b/src/backend/base/langflow/components/firecrawl/firecrawl_extract_api.py
@@ -0,0 +1,155 @@
+from loguru import logger
+
+from langflow.custom import Component
+from langflow.io import (
+    BoolInput,
+    DataInput,
+    MultilineInput,
+    Output,
+    SecretStrInput,
+)
+from langflow.schema import Data
+
+
+class FirecrawlExtractApi(Component):
+    display_name: str = "FirecrawlExtractApi"
+    description: str = "Firecrawl Extract API."
+    name = "FirecrawlExtractApi"
+
+    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/extract"
+
+    inputs = [
+        SecretStrInput(
+            name="api_key",
+            display_name="API Key",
+            required=True,
+            password=True,
+            info="The API key to use Firecrawl API.",
+        ),
+        MultilineInput(
+            name="urls",
+            display_name="URLs",
+            required=True,
+            info="List of URLs to extract data from (separated by commas or new lines).",
+            tool_mode=True,
+        ),
+        MultilineInput(
+            name="prompt",
+            display_name="Prompt",
+            required=True,
+            info="Prompt to guide the extraction process.",
+            tool_mode=True,
+        ),
+        DataInput(
+            name="schema",
+            display_name="Schema",
+            required=False,
+            info="Schema to define the structure of the extracted data.",
+        ),
+        BoolInput(
+            name="enable_web_search",
+            display_name="Enable Web Search",
+            info="When true, the extraction will use web search to find additional data.",
+        ),
+        # # Optional: Not essential for basic extraction
+        # BoolInput(
+        #     name="ignore_sitemap",
+        #     display_name="Ignore Sitemap",
+        #     info="When true, sitemap.xml files will be ignored during website scanning.",
+        # ),
+        # # Optional: Not essential for basic extraction
+        # BoolInput(
+        #     name="include_subdomains",
+        #     display_name="Include Subdomains",
+        #     info="When true, subdomains of the provided URLs will also be scanned.",
+        # ),
+        # # Optional: Not essential for basic extraction
+        # BoolInput(
+        #     name="show_sources",
+        #     display_name="Show Sources",
+        #     info="When true, the sources used to extract the data will be included in the response.",
+        # ),
+    ]
+
+    outputs = [
+        Output(display_name="Data", name="data", method="extract"),
+    ]
+
+    def extract(self) -> Data:
+        """Extract structured data from the given URLs with Firecrawl.
+
+        The prompt is sent as-is when it mentions "schema"; otherwise a generic
+        "extract all instances" instruction is appended to it.
+
+        Returns:
+            Data wrapping the value returned by ``FirecrawlApp.extract``.
+
+        Raises:
+            ImportError: If the ``firecrawl`` package cannot be imported.
+            ValueError: If the API key, URLs or prompt are missing, or if the
+                Firecrawl call fails.
+        """
+        try:
+            from firecrawl import FirecrawlApp
+        except ImportError as e:
+            msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
+            raise ImportError(msg) from e
+
+        # Validate API key
+        if not self.api_key:
+            msg = "API key is required"
+            raise ValueError(msg)
+
+        # Validate URLs
+        if not self.urls:
+            msg = "URLs are required"
+            raise ValueError(msg)
+
+        # Split and validate URLs (handle both commas and newlines)
+        urls = [url.strip() for url in self.urls.replace("\n", ",").split(",") if url.strip()]
+        if not urls:
+            msg = "No valid URLs provided"
+            raise ValueError(msg)
+
+        # Validate and process prompt
+        if not self.prompt:
+            msg = "Prompt is required"
+            raise ValueError(msg)
+
+        # Get the prompt text (handling both string and multiline input)
+        prompt_text = self.prompt.strip()
+
+        # Enhance the prompt to encourage comprehensive extraction
+        enhanced_prompt = prompt_text
+        if "schema" not in prompt_text.lower():
+            enhanced_prompt = f"{prompt_text}. Please extract all instances in a comprehensive, structured format."
+
+        # Only forward options backed by an enabled input: ignore_sitemap,
+        # include_subdomains and show_sources are commented out in `inputs`, so
+        # those attributes are not defined on this component.
+        params = {
+            "prompt": enhanced_prompt,
+            "enableWebSearch": self.enable_web_search,
+            "timeout": 300,
+        }
+
+        # Only add schema to params if it's provided and is a valid schema structure
+        if self.schema:
+            try:
+                if isinstance(self.schema, dict) and "type" in self.schema:
+                    params["schema"] = self.schema
+                elif hasattr(self.schema, "dict") and "type" in self.schema.dict():
+                    params["schema"] = self.schema.dict()
+                else:
+                    # Best effort: an unusable schema is skipped rather than failing the run.
+                    logger.warning("Ignoring schema: expected a mapping with a top-level 'type' key")
+            except Exception as e:  # noqa: BLE001
+                logger.error(f"Invalid schema: {e!s}")
+
+        try:
+            app = FirecrawlApp(api_key=self.api_key)
+            extract_result = app.extract(urls, params=params)
+            return Data(data=extract_result)
+        except Exception as e:
+            msg = f"Error during extraction: {e!s}"
+            raise ValueError(msg) from e
diff --git a/src/backend/base/langflow/components/firecrawl/firecrawl_map_api.py b/src/backend/base/langflow/components/firecrawl/firecrawl_map_api.py
new file mode 100644
index 000000000..73dd55a02
--- /dev/null
+++ b/src/backend/base/langflow/components/firecrawl/firecrawl_map_api.py
@@ -0,0 +1,101 @@
+from langflow.custom import Component
+from langflow.io import (
+    BoolInput,
+    MultilineInput,
+    Output,
+    SecretStrInput,
+)
+from langflow.schema import Data
+
+
+class FirecrawlMapApi(Component):
+    display_name: str = "FirecrawlMapApi"
+    description: str = "Firecrawl Map API."
+    name = "FirecrawlMapApi"
+
+    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/map"
+
+    inputs = [
+        SecretStrInput(
+            name="api_key",
+            display_name="API Key",
+            required=True,
+            password=True,
+            info="The API key to use Firecrawl API.",
+        ),
+        MultilineInput(
+            name="urls",
+            display_name="URLs",
+            required=True,
+            info="List of URLs to create maps from (separated by commas or new lines).",
+            tool_mode=True,
+        ),
+        BoolInput(
+            name="ignore_sitemap",
+            display_name="Ignore Sitemap",
+            info="When true, the sitemap.xml file will be ignored during crawling.",
+        ),
+        BoolInput(
+            name="sitemap_only",
+            display_name="Sitemap Only",
+            info="When true, only links found in the sitemap will be returned.",
+        ),
+        BoolInput(
+            name="include_subdomains",
+            display_name="Include Subdomains",
+            info="When true, subdomains of the provided URL will also be scanned.",
+        ),
+    ]
+
+    outputs = [
+        Output(display_name="Data", name="data", method="map"),
+    ]
+
+    def map(self) -> Data:
+        """Map each given URL with Firecrawl and merge the discovered links.
+
+        Results that are not a dict containing a "links" key are skipped.
+
+        Returns:
+            Data holding ``{"success": True, "links": [...]}`` where the list
+            concatenates the links of every URL in input order.
+
+        Raises:
+            ImportError: If the ``firecrawl`` package cannot be imported.
+            ValueError: If no URL is provided.
+        """
+        try:
+            from firecrawl import FirecrawlApp
+        except ImportError as e:
+            msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
+            raise ImportError(msg) from e
+
+        # Validate URLs
+        if not self.urls:
+            msg = "URLs are required"
+            raise ValueError(msg)
+
+        # Split and validate URLs (handle both commas and newlines)
+        urls = [url.strip() for url in self.urls.replace("\n", ",").split(",") if url.strip()]
+        if not urls:
+            msg = "No valid URLs provided"
+            raise ValueError(msg)
+
+        params = {
+            "ignoreSitemap": self.ignore_sitemap,
+            "sitemapOnly": self.sitemap_only,
+            "includeSubdomains": self.include_subdomains,
+        }
+
+        app = FirecrawlApp(api_key=self.api_key)
+
+        # Map all provided URLs and combine results
+        combined_links = []
+        for url in urls:
+            result = app.map_url(url, params=params)
+            if isinstance(result, dict) and "links" in result:
+                combined_links.extend(result["links"])
+
+        map_result = {"success": True, "links": combined_links}
+
+        return Data(data=map_result)
diff --git a/src/backend/base/langflow/components/firecrawl/firecrawl_scrape_api.py b/src/backend/base/langflow/components/firecrawl/firecrawl_scrape_api.py
index 735686285..d9989449f 100644
--- a/src/backend/base/langflow/components/firecrawl/firecrawl_scrape_api.py
+++ b/src/backend/base/langflow/components/firecrawl/firecrawl_scrape_api.py
@@ -14,7 +14,6 @@ class FirecrawlScrapeApi(Component):
     description: str = "Firecrawl Scrape API."
     name = "FirecrawlScrapeApi"
 
-    output_types: list[str] = ["Document"]
     documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/scrape"
 
     inputs = [
@@ -42,7 +41,7 @@ class FirecrawlScrapeApi(Component):
             display_name="Scrape Options",
             info="The page options to send with the request.",
         ),
-        DataInput(  # https://docs.firecrawl.dev/features/extract
+        DataInput(
             name="extractorOptions",
             display_name="Extractor Options",
             info="The extractor options to send with the request.",
@@ -50,21 +49,25 @@ class FirecrawlScrapeApi(Component):
     ]
 
     outputs = [
-        Output(display_name="Data", name="data", method="crawl"),
+        Output(display_name="Data", name="data", method="scrape"),
     ]
 
-    def crawl(self) -> list[Data]:
+    def scrape(self) -> Data:
         try:
-            from firecrawl.firecrawl import FirecrawlApp
+            from firecrawl import FirecrawlApp
         except ImportError as e:
             msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
             raise ImportError(msg) from e
 
-        params = self.scrapeOptions.__dict__["data"] if self.scrapeOptions else {}
-        extractor_options_dict = self.extractorOptions.__dict__["data"] if self.extractorOptions else {}
+        params = self.scrapeOptions.__dict__.get("data", {}) if self.scrapeOptions else {}
+        extractor_options_dict = self.extractorOptions.__dict__.get("data", {}) if self.extractorOptions else {}
         if extractor_options_dict:
             params["extract"] = extractor_options_dict
 
+        # Set default values for parameters
+        params.setdefault("formats", ["markdown"])  # Default output format
+        params.setdefault("onlyMainContent", True)  # Default to only main content
+
         app = FirecrawlApp(api_key=self.api_key)
         results = app.scrape_url(self.url, params=params)
         return Data(data=results)
diff --git a/src/frontend/src/utils/styleUtils.ts b/src/frontend/src/utils/styleUtils.ts
index 05b950e5d..3c4f7c719 100644
--- a/src/frontend/src/utils/styleUtils.ts
+++ b/src/frontend/src/utils/styleUtils.ts
@@ -656,6 +656,8 @@ export const nodeIconsLucide: iconsType = {
   FacebookChatLoader: FBIcon,
   FirecrawlCrawlApi: FirecrawlIcon,
   FirecrawlScrapeApi: FirecrawlIcon,
+  FirecrawlMapApi: FirecrawlIcon,
+  FirecrawlExtractApi: FirecrawlIcon,
   GitbookLoader: GitBookIcon,
   GoogleSearchAPIWrapper: GoogleIcon,
   GoogleSearchResults: GoogleIcon,