feat: add map and extract endpoints with v1 updates for scrape and crawl (#6787)
* feat: add new map extraction feature * refactor: Improve Firecrawl component code quality and error handling * refactor: Remove redundant output_types from Firecrawl API components * [autofix.ci] apply automated fixes --------- Co-authored-by: Aparup Ganguly <aparup@Aparups-Mac-mini.local> Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org> Co-authored-by: Edwin Jose <edwin.jose@datastax.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
8f4515e655
commit
a301ff3e58
6 changed files with 263 additions and 11 deletions
|
|
@ -1,4 +1,6 @@
|
|||
from .firecrawl_crawl_api import FirecrawlCrawlApi
|
||||
from .firecrawl_extract_api import FirecrawlExtractApi
|
||||
from .firecrawl_map_api import FirecrawlMapApi
|
||||
from .firecrawl_scrape_api import FirecrawlScrapeApi
|
||||
|
||||
__all__ = ["FirecrawlCrawlApi", "FirecrawlScrapeApi"]
|
||||
__all__ = ["FirecrawlCrawlApi", "FirecrawlExtractApi", "FirecrawlMapApi", "FirecrawlScrapeApi"]
|
||||
|
|
|
|||
|
|
@ -10,8 +10,7 @@ class FirecrawlCrawlApi(Component):
|
|||
description: str = "Firecrawl Crawl API."
|
||||
name = "FirecrawlCrawlApi"
|
||||
|
||||
output_types: list[str] = ["Document"]
|
||||
documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/crawl-post"
|
||||
documentation: str = "https://docs.firecrawl.dev/v1/api-reference/endpoint/crawl-post"
|
||||
|
||||
inputs = [
|
||||
SecretStrInput(
|
||||
|
|
@ -57,7 +56,7 @@ class FirecrawlCrawlApi(Component):
|
|||
|
||||
def crawl(self) -> Data:
|
||||
try:
|
||||
from firecrawl.firecrawl import FirecrawlApp
|
||||
from firecrawl import FirecrawlApp
|
||||
except ImportError as e:
|
||||
msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
|
||||
raise ImportError(msg) from e
|
||||
|
|
@ -67,6 +66,20 @@ class FirecrawlCrawlApi(Component):
|
|||
if scrape_options_dict:
|
||||
params["scrapeOptions"] = scrape_options_dict
|
||||
|
||||
# Set default values for new parameters in v1
|
||||
params.setdefault("maxDepth", 2)
|
||||
params.setdefault("limit", 10000)
|
||||
params.setdefault("allowExternalLinks", False)
|
||||
params.setdefault("allowBackwardLinks", False)
|
||||
params.setdefault("ignoreSitemap", False)
|
||||
params.setdefault("ignoreQueryParameters", False)
|
||||
|
||||
# Ensure onlyMainContent is explicitly set if not provided
|
||||
if "scrapeOptions" in params:
|
||||
params["scrapeOptions"].setdefault("onlyMainContent", True)
|
||||
else:
|
||||
params["scrapeOptions"] = {"onlyMainContent": True}
|
||||
|
||||
if not self.idempotency_key:
|
||||
self.idempotency_key = str(uuid.uuid4())
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,143 @@
|
|||
from loguru import logger
|
||||
|
||||
from langflow.custom import Component
|
||||
from langflow.io import (
|
||||
BoolInput,
|
||||
DataInput,
|
||||
MultilineInput,
|
||||
Output,
|
||||
SecretStrInput,
|
||||
)
|
||||
from langflow.schema import Data
|
||||
|
||||
|
||||
class FirecrawlExtractApi(Component):
    """Langflow component wrapping the Firecrawl /extract endpoint.

    Takes one or more URLs plus a natural-language prompt (and optionally a
    JSON schema) and returns the structured data Firecrawl extracts from
    those pages as a ``Data`` object.
    """

    display_name: str = "FirecrawlExtractApi"
    description: str = "Firecrawl Extract API."
    name = "FirecrawlExtractApi"

    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/extract"

    inputs = [
        SecretStrInput(
            name="api_key",
            display_name="API Key",
            required=True,
            password=True,
            info="The API key to use Firecrawl API.",
        ),
        MultilineInput(
            name="urls",
            display_name="URLs",
            required=True,
            info="List of URLs to extract data from (separated by commas or new lines).",
            tool_mode=True,
        ),
        MultilineInput(
            name="prompt",
            display_name="Prompt",
            required=True,
            info="Prompt to guide the extraction process.",
            tool_mode=True,
        ),
        DataInput(
            name="schema",
            display_name="Schema",
            required=False,
            info="Schema to define the structure of the extracted data.",
        ),
        BoolInput(
            name="enable_web_search",
            display_name="Enable Web Search",
            info="When true, the extraction will use web search to find additional data.",
        ),
        # # Optional: Not essential for basic extraction
        # BoolInput(
        #     name="ignore_sitemap",
        #     display_name="Ignore Sitemap",
        #     info="When true, sitemap.xml files will be ignored during website scanning.",
        # ),
        # # Optional: Not essential for basic extraction
        # BoolInput(
        #     name="include_subdomains",
        #     display_name="Include Subdomains",
        #     info="When true, subdomains of the provided URLs will also be scanned.",
        # ),
        # # Optional: Not essential for basic extraction
        # BoolInput(
        #     name="show_sources",
        #     display_name="Show Sources",
        #     info="When true, the sources used to extract the data will be included in the response.",
        # ),
    ]

    outputs = [
        Output(display_name="Data", name="data", method="extract"),
    ]

    def extract(self) -> Data:
        """Run the Firecrawl extract call and wrap the result in ``Data``.

        Raises:
            ImportError: if the ``firecrawl-py`` package is not installed.
            ValueError: on missing/invalid inputs or any extraction failure.
        """
        try:
            from firecrawl import FirecrawlApp
        except ImportError as e:
            msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
            raise ImportError(msg) from e

        # Validate API key
        if not self.api_key:
            msg = "API key is required"
            raise ValueError(msg)

        # Validate URLs
        if not self.urls:
            msg = "URLs are required"
            raise ValueError(msg)

        # Split and validate URLs (handle both commas and newlines)
        urls = [url.strip() for url in self.urls.replace("\n", ",").split(",") if url.strip()]
        if not urls:
            msg = "No valid URLs provided"
            raise ValueError(msg)

        # Validate and process prompt
        if not self.prompt:
            msg = "Prompt is required"
            raise ValueError(msg)

        # Get the prompt text (handling both string and multiline input)
        prompt_text = self.prompt.strip()

        # Enhance the prompt to encourage comprehensive extraction
        enhanced_prompt = prompt_text
        if "schema" not in prompt_text.lower():
            enhanced_prompt = f"{prompt_text}. Please extract all instances in a comprehensive, structured format."

        params = {
            "prompt": enhanced_prompt,
            "enableWebSearch": self.enable_web_search,
            # BUG FIX: the three inputs below are commented out in `inputs`,
            # so the corresponding attributes may not exist on this component.
            # Use getattr() with safe defaults instead of direct attribute
            # access, which previously raised AttributeError at runtime.
            "ignoreSitemap": getattr(self, "ignore_sitemap", False),
            "includeSubdomains": getattr(self, "include_subdomains", False),
            "showSources": getattr(self, "show_sources", False),
            "timeout": 300,
        }

        # Only add schema to params if it's provided and is a valid schema structure;
        # invalid schemas are deliberately skipped (logged) rather than fatal.
        if self.schema:
            try:
                if isinstance(self.schema, dict) and "type" in self.schema:
                    params["schema"] = self.schema
                elif hasattr(self.schema, "dict") and "type" in self.schema.dict():
                    params["schema"] = self.schema.dict()
            except Exception as e:  # noqa: BLE001
                logger.error(f"Invalid schema: {e!s}")

        try:
            app = FirecrawlApp(api_key=self.api_key)
            extract_result = app.extract(urls, params=params)
            return Data(data=extract_result)
        except Exception as e:
            msg = f"Error during extraction: {e!s}"
            raise ValueError(msg) from e
|
||||
|
|
@ -0,0 +1,89 @@
|
|||
from langflow.custom import Component
|
||||
from langflow.io import (
|
||||
BoolInput,
|
||||
MultilineInput,
|
||||
Output,
|
||||
SecretStrInput,
|
||||
)
|
||||
from langflow.schema import Data
|
||||
|
||||
|
||||
class FirecrawlMapApi(Component):
    """Langflow component wrapping the Firecrawl /map endpoint.

    Maps each provided URL to the set of links Firecrawl discovers on it and
    returns the combined link list as a ``Data`` object.
    """

    display_name: str = "FirecrawlMapApi"
    description: str = "Firecrawl Map API."
    name = "FirecrawlMapApi"

    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/map"

    inputs = [
        SecretStrInput(
            name="api_key",
            display_name="API Key",
            required=True,
            password=True,
            info="The API key to use Firecrawl API.",
        ),
        MultilineInput(
            name="urls",
            display_name="URLs",
            required=True,
            info="List of URLs to create maps from (separated by commas or new lines).",
            tool_mode=True,
        ),
        BoolInput(
            name="ignore_sitemap",
            display_name="Ignore Sitemap",
            info="When true, the sitemap.xml file will be ignored during crawling.",
        ),
        BoolInput(
            name="sitemap_only",
            display_name="Sitemap Only",
            info="When true, only links found in the sitemap will be returned.",
        ),
        BoolInput(
            name="include_subdomains",
            display_name="Include Subdomains",
            info="When true, subdomains of the provided URL will also be scanned.",
        ),
    ]

    outputs = [
        Output(display_name="Data", name="data", method="map"),
    ]

    def map(self) -> Data:
        """Map all provided URLs and return the combined discovered links.

        Raises:
            ImportError: if the ``firecrawl-py`` package is not installed.
            ValueError: if the API key or URLs are missing/invalid.
        """
        try:
            from firecrawl import FirecrawlApp
        except ImportError as e:
            msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
            raise ImportError(msg) from e

        # Validate API key (fail fast, consistent with FirecrawlExtractApi)
        if not self.api_key:
            msg = "API key is required"
            raise ValueError(msg)

        # Validate URLs
        if not self.urls:
            msg = "URLs are required"
            raise ValueError(msg)

        # Split and validate URLs (handle both commas and newlines)
        urls = [url.strip() for url in self.urls.replace("\n", ",").split(",") if url.strip()]
        if not urls:
            msg = "No valid URLs provided"
            raise ValueError(msg)

        params = {
            "ignoreSitemap": self.ignore_sitemap,
            "sitemapOnly": self.sitemap_only,
            "includeSubdomains": self.include_subdomains,
        }

        app = FirecrawlApp(api_key=self.api_key)

        # Map all provided URLs and combine results; responses without a
        # "links" key are silently skipped (best-effort aggregation).
        combined_links = []
        for url in urls:
            result = app.map_url(url, params=params)
            if isinstance(result, dict) and "links" in result:
                combined_links.extend(result["links"])

        map_result = {"success": True, "links": combined_links}

        return Data(data=map_result)
|
||||
|
|
@ -14,7 +14,6 @@ class FirecrawlScrapeApi(Component):
|
|||
description: str = "Firecrawl Scrape API."
|
||||
name = "FirecrawlScrapeApi"
|
||||
|
||||
output_types: list[str] = ["Document"]
|
||||
documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/scrape"
|
||||
|
||||
inputs = [
|
||||
|
|
@ -42,7 +41,7 @@ class FirecrawlScrapeApi(Component):
|
|||
display_name="Scrape Options",
|
||||
info="The page options to send with the request.",
|
||||
),
|
||||
DataInput( # https://docs.firecrawl.dev/features/extract
|
||||
DataInput(
|
||||
name="extractorOptions",
|
||||
display_name="Extractor Options",
|
||||
info="The extractor options to send with the request.",
|
||||
|
|
@ -50,21 +49,25 @@ class FirecrawlScrapeApi(Component):
|
|||
]
|
||||
|
||||
outputs = [
|
||||
Output(display_name="Data", name="data", method="crawl"),
|
||||
Output(display_name="Data", name="data", method="scrape"),
|
||||
]
|
||||
|
||||
def crawl(self) -> list[Data]:
|
||||
def scrape(self) -> Data:
|
||||
try:
|
||||
from firecrawl.firecrawl import FirecrawlApp
|
||||
from firecrawl import FirecrawlApp
|
||||
except ImportError as e:
|
||||
msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
|
||||
raise ImportError(msg) from e
|
||||
|
||||
params = self.scrapeOptions.__dict__["data"] if self.scrapeOptions else {}
|
||||
extractor_options_dict = self.extractorOptions.__dict__["data"] if self.extractorOptions else {}
|
||||
params = self.scrapeOptions.__dict__.get("data", {}) if self.scrapeOptions else {}
|
||||
extractor_options_dict = self.extractorOptions.__dict__.get("data", {}) if self.extractorOptions else {}
|
||||
if extractor_options_dict:
|
||||
params["extract"] = extractor_options_dict
|
||||
|
||||
# Set default values for parameters
|
||||
params.setdefault("formats", ["markdown"]) # Default output format
|
||||
params.setdefault("onlyMainContent", True) # Default to only main content
|
||||
|
||||
app = FirecrawlApp(api_key=self.api_key)
|
||||
results = app.scrape_url(self.url, params=params)
|
||||
return Data(data=results)
|
||||
|
|
|
|||
|
|
@ -656,6 +656,8 @@ export const nodeIconsLucide: iconsType = {
|
|||
FacebookChatLoader: FBIcon,
|
||||
FirecrawlCrawlApi: FirecrawlIcon,
|
||||
FirecrawlScrapeApi: FirecrawlIcon,
|
||||
FirecrawlMapApi: FirecrawlIcon,
|
||||
FirecrawlExtractApi: FirecrawlIcon,
|
||||
GitbookLoader: GitBookIcon,
|
||||
GoogleSearchAPIWrapper: GoogleIcon,
|
||||
GoogleSearchResults: GoogleIcon,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue