feat: knowledge pipeline (#25360)

Signed-off-by: -LAN- <laipz8200@outlook.com>
Co-authored-by: twwu <twwu@dify.ai>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: jyong <718720800@qq.com>
Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com>
Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com>
Co-authored-by: lyzno1 <yuanyouhuilyz@gmail.com>
Co-authored-by: quicksand <quicksandzn@gmail.com>
Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: lyzno1 <92089059+lyzno1@users.noreply.github.com>
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Joel <iamjoel007@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: nite-knite <nkCoding@gmail.com>
Co-authored-by: Hanqing Zhao <sherry9277@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry <xh001x@hotmail.com>
This commit is contained in:
-LAN- 2025-09-18 12:49:10 +08:00 committed by GitHub
commit 85cda47c70
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
1772 changed files with 102407 additions and 31710 deletions

View file

@ -20,7 +20,7 @@ from configs import dify_config
from core.helper import ssrf_proxy
from core.model_runtime.utils.encoders import jsonable_encoder
from core.plugin.entities.plugin import PluginDependency
from core.workflow.nodes.enums import NodeType
from core.workflow.enums import NodeType
from core.workflow.nodes.knowledge_retrieval.entities import KnowledgeRetrievalNodeData
from core.workflow.nodes.llm.entities import LLMNodeData
from core.workflow.nodes.parameter_extractor.entities import ParameterExtractorNodeData

View file

@ -116,7 +116,6 @@ class AppGenerateService:
invoke_from=invoke_from,
streaming=streaming,
call_depth=0,
workflow_thread_pool_id=None,
),
),
request_id,

View file

@ -16,9 +16,9 @@ from werkzeug.exceptions import NotFound
from configs import dify_config
from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError
from core.helper.name_generator import generate_incremental_name
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.plugin.entities.plugin import ModelProviderID
from core.rag.index_processor.constant.built_in_field import BuiltInField
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.retrieval.retrieval_methods import RetrievalMethod
@ -43,9 +43,12 @@ from models.dataset import (
Document,
DocumentSegment,
ExternalKnowledgeBindings,
Pipeline,
)
from models.model import UploadFile
from models.provider_ids import ModelProviderID
from models.source import DataSourceOauthBinding
from models.workflow import Workflow
from services.entities.knowledge_entities.knowledge_entities import (
ChildChunkUpdateArgs,
KnowledgeConfig,
@ -53,6 +56,10 @@ from services.entities.knowledge_entities.knowledge_entities import (
RetrievalModel,
SegmentUpdateArgs,
)
from services.entities.knowledge_entities.rag_pipeline_entities import (
KnowledgeConfiguration,
RagPipelineDatasetCreateEntity,
)
from services.errors.account import NoPermissionError
from services.errors.chunk import ChildChunkDeleteIndexError, ChildChunkIndexingError
from services.errors.dataset import DatasetNameDuplicateError
@ -60,11 +67,13 @@ from services.errors.document import DocumentIndexingError
from services.errors.file import FileNotExistsError
from services.external_knowledge_service import ExternalDatasetService
from services.feature_service import FeatureModel, FeatureService
from services.rag_pipeline.rag_pipeline import RagPipelineService
from services.tag_service import TagService
from services.vector_service import VectorService
from tasks.add_document_to_index_task import add_document_to_index_task
from tasks.batch_clean_document_task import batch_clean_document_task
from tasks.clean_notion_document_task import clean_notion_document_task
from tasks.deal_dataset_index_update_task import deal_dataset_index_update_task
from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task
from tasks.delete_segment_from_index_task import delete_segment_from_index_task
from tasks.disable_segment_from_index_task import disable_segment_from_index_task
@ -256,6 +265,55 @@ class DatasetService:
db.session.commit()
return dataset
@staticmethod
def create_empty_rag_pipeline_dataset(
    tenant_id: str,
    rag_pipeline_dataset_create_entity: RagPipelineDatasetCreateEntity,
):
    """
    Create an empty dataset together with the pipeline that backs it.

    When the entity carries a name, the name must not already be used by a
    dataset of the tenant. When it carries no name, one is generated with
    ``generate_incremental_name`` from the tenant's existing dataset names
    and the "Untitled" prefix, and written back onto the entity.

    Args:
        tenant_id: Tenant that will own the pipeline and the dataset.
        rag_pipeline_dataset_create_entity: Creation payload (name,
            description, permission, icon info). Its ``name`` is mutated
            when it was empty.

    Returns:
        The newly committed ``Dataset``.

    Raises:
        DatasetNameDuplicateError: The requested name is already taken.
        ValueError: There is no current user, or the user has no id.
    """
    if not rag_pipeline_dataset_create_entity.name:
        # No name requested: pick the next free "Untitled ..." name for this tenant.
        tenant_datasets = db.session.query(Dataset).filter_by(tenant_id=tenant_id).all()
        rag_pipeline_dataset_create_entity.name = generate_incremental_name(
            [existing.name for existing in tenant_datasets],
            "Untitled",
        )
    else:
        # An explicit name must be unique within the tenant.
        name_query = db.session.query(Dataset).filter_by(
            name=rag_pipeline_dataset_create_entity.name, tenant_id=tenant_id
        )
        if name_query.first():
            raise DatasetNameDuplicateError(
                f"Dataset with name {rag_pipeline_dataset_create_entity.name} already exists."
            )

    if not current_user or not current_user.id:
        raise ValueError("Current user or current user id not found")

    dataset_name = rag_pipeline_dataset_create_entity.name
    dataset_description = rag_pipeline_dataset_create_entity.description

    pipeline = Pipeline(
        tenant_id=tenant_id,
        name=dataset_name,
        description=dataset_description,
        created_by=current_user.id,
    )
    db.session.add(pipeline)
    # Flush so pipeline.id is populated before the dataset references it.
    db.session.flush()

    dataset = Dataset(
        tenant_id=tenant_id,
        name=dataset_name,
        description=dataset_description,
        permission=rag_pipeline_dataset_create_entity.permission,
        provider="vendor",
        runtime_mode="rag_pipeline",
        icon_info=rag_pipeline_dataset_create_entity.icon_info.model_dump(),
        created_by=current_user.id,
        pipeline_id=pipeline.id,
    )
    db.session.add(dataset)
    db.session.commit()
    return dataset
@staticmethod
def get_dataset(dataset_id) -> Dataset | None:
dataset: Dataset | None = db.session.query(Dataset).filter_by(id=dataset_id).first()
@ -339,6 +397,14 @@ class DatasetService:
dataset = DatasetService.get_dataset(dataset_id)
if not dataset:
raise ValueError("Dataset not found")
# check if dataset name is exists
if DatasetService._has_dataset_same_name(
tenant_id=dataset.tenant_id,
dataset_id=dataset_id,
name=data.get("name", dataset.name),
):
raise ValueError("Dataset name already exists")
# Verify user has permission to update this dataset
DatasetService.check_dataset_permission(dataset, user)
@ -349,6 +415,19 @@ class DatasetService:
else:
return DatasetService._update_internal_dataset(dataset, data, user)
@staticmethod
def _has_dataset_same_name(tenant_id: str, dataset_id: str, name: str):
    """
    Tell whether a different dataset of the tenant already uses ``name``.

    Args:
        tenant_id: Tenant whose datasets are searched.
        dataset_id: Dataset to exclude from the search (the one being renamed).
        name: Candidate dataset name.

    Returns:
        True when at least one other dataset matches, otherwise False.
    """
    conditions = (
        Dataset.id != dataset_id,
        Dataset.name == name,
        Dataset.tenant_id == tenant_id,
    )
    return db.session.query(Dataset).where(*conditions).first() is not None
@staticmethod
def _update_external_dataset(dataset, data, user):
"""
@ -454,17 +533,105 @@ class DatasetService:
filtered_data["updated_at"] = naive_utc_now()
# update Retrieval model
filtered_data["retrieval_model"] = data["retrieval_model"]
# update icon info
if data.get("icon_info"):
filtered_data["icon_info"] = data.get("icon_info")
# Update dataset in database
db.session.query(Dataset).filter_by(id=dataset.id).update(filtered_data)
db.session.commit()
# update pipeline knowledge base node data
DatasetService._update_pipeline_knowledge_base_node_data(dataset, user.id)
# Trigger vector index task if indexing technique changed
if action:
deal_dataset_vector_index_task.delay(dataset.id, action)
return dataset
@staticmethod
def _update_pipeline_knowledge_base_node_data(dataset: Dataset, updata_user_id: str):
    """
    Copy the dataset's index settings into its pipeline's knowledge-index nodes.

    Nothing happens unless ``dataset.runtime_mode`` is ``"rag_pipeline"`` and
    the pipeline referenced by ``dataset.pipeline_id`` exists. For the
    published workflow a changed graph is stored as a new workflow version;
    for the draft workflow the graph is updated in place. Both changes are
    committed together.

    A node counts as changed only when at least one synced value differs from
    what the node already holds, so an unchanged dataset never produces a new
    published version merely because the graph was re-serialized.

    Args:
        dataset: Dataset whose embedding, retrieval, chunking and indexing
            settings are written into the nodes.
        updata_user_id: Id recorded as ``created_by`` on a newly created
            workflow version. (The spelling is kept for caller compatibility.)

    Raises:
        Exception: Any failure while loading or saving the workflows is
            logged, the session is rolled back, and the error is re-raised.
    """
    if dataset.runtime_mode != "rag_pipeline":
        return

    pipeline = db.session.query(Pipeline).filter_by(id=dataset.pipeline_id).first()
    if not pipeline:
        return

    def update_knowledge_nodes(workflow_graph: str) -> str:
        """Return the graph with knowledge-index nodes synced, or the input unchanged."""
        data: dict[str, Any] = json.loads(workflow_graph)
        updated = False
        for node in data.get("nodes", []):
            node_data = node.get("data", {})
            if node_data.get("type") != "knowledge-index":
                continue
            try:
                # Read every dataset attribute first so a failure cannot leave
                # the node half-updated.
                synced_fields = {
                    "embedding_model": dataset.embedding_model,
                    "embedding_model_provider": dataset.embedding_model_provider,
                    "retrieval_model": dataset.retrieval_model,
                    "chunk_structure": dataset.chunk_structure,
                    "indexing_technique": dataset.indexing_technique,  # pyright: ignore[reportAttributeAccessIssue]
                    "keyword_number": dataset.keyword_number,
                }
            except Exception:
                # Best effort per node: log and move on to the next one.
                logging.exception("Failed to update knowledge node")
                continue
            changed = any(
                key not in node_data or node_data[key] != value for key, value in synced_fields.items()
            )
            if changed:
                # node_data is the node's own dict, so this edits the graph in place.
                node_data.update(synced_fields)
                updated = True
        # Re-serialize only when something changed, so callers can compare
        # the result with the stored string.
        return json.dumps(data) if updated else workflow_graph

    try:
        rag_pipeline_service = RagPipelineService()
        published_workflow = rag_pipeline_service.get_published_workflow(pipeline)
        draft_workflow = rag_pipeline_service.get_draft_workflow(pipeline)

        # Published workflow: store a changed graph as a new version.
        if published_workflow:
            updated_graph = update_knowledge_nodes(published_workflow.graph)
            if updated_graph != published_workflow.graph:
                workflow = Workflow.new(
                    tenant_id=pipeline.tenant_id,
                    app_id=pipeline.id,
                    type=published_workflow.type,
                    version=str(datetime.datetime.now(datetime.UTC).replace(tzinfo=None)),
                    graph=updated_graph,
                    features=published_workflow.features,
                    created_by=updata_user_id,
                    environment_variables=published_workflow.environment_variables,
                    conversation_variables=published_workflow.conversation_variables,
                    rag_pipeline_variables=published_workflow.rag_pipeline_variables,
                    marked_name="",
                    marked_comment="",
                )
                db.session.add(workflow)

        # Draft workflow: update the graph in place.
        if draft_workflow:
            updated_graph = update_knowledge_nodes(draft_workflow.graph)
            if updated_graph != draft_workflow.graph:
                draft_workflow.graph = updated_graph
                db.session.add(draft_workflow)

        # Commit all changes in one transaction.
        db.session.commit()
    except Exception:
        logging.exception("Failed to update pipeline knowledge base node data")
        db.session.rollback()
        raise
@staticmethod
def _handle_indexing_technique_change(dataset, data, filtered_data):
"""
@ -654,6 +821,133 @@ class DatasetService:
)
filtered_data["collection_binding_id"] = dataset_collection_binding.id
@staticmethod
def update_rag_pipeline_dataset_settings(
session: Session, dataset: Dataset, knowledge_configuration: KnowledgeConfiguration, has_published: bool = False
):
"""
Apply a pipeline's knowledge configuration to its dataset.

When ``has_published`` is false, chunk structure, indexing technique,
embedding settings (for "high_quality") or keyword number (for "economy")
and the retrieval model are written to the dataset and the dataset is
added to ``session``.

When ``has_published`` is true, a change of chunk structure and a switch
to "economy" are rejected. A switch to "high_quality", or a changed
embedding provider/model, records an index action ("add" or "update")
that is handed to ``deal_dataset_index_update_task``.

NOTE(review): indentation is not visible in this view. ``action`` is only
assigned in the ``has_published`` branch, so the final commit and task
dispatch presumably belong to that branch only — confirm against the
original file.

Args:
    session: SQLAlchemy session the dataset is merged into.
    dataset: Dataset to update; re-bound to the merged instance.
    knowledge_configuration: Settings taken from the pipeline's knowledge node.
    has_published: Selects the restricted update path described above.

Raises:
    ValueError: No current user or tenant, invalid indexing technique,
        a disallowed change, or no usable embedding model/provider.
"""
if not current_user or not current_user.current_tenant_id:
raise ValueError("Current user or current tenant not found")
# Work on the instance attached to the given session.
dataset = session.merge(dataset)
if not has_published:
# has_published is false: take the settings from the configuration as they are.
dataset.chunk_structure = knowledge_configuration.chunk_structure
dataset.indexing_technique = knowledge_configuration.indexing_technique
if knowledge_configuration.indexing_technique == "high_quality":
# Resolve the embedding model through ModelManager; missing values are passed as "".
model_manager = ModelManager()
embedding_model = model_manager.get_model_instance(
tenant_id=current_user.current_tenant_id,  # ignore type error
provider=knowledge_configuration.embedding_model_provider or "",
model_type=ModelType.TEXT_EMBEDDING,
model=knowledge_configuration.embedding_model or "",
)
dataset.embedding_model = embedding_model.model
dataset.embedding_model_provider = embedding_model.provider
# Bind the dataset to the collection for this provider/model pair.
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
embedding_model.provider, embedding_model.model
)
dataset.collection_binding_id = dataset_collection_binding.id
elif knowledge_configuration.indexing_technique == "economy":
dataset.keyword_number = knowledge_configuration.keyword_number
else:
raise ValueError("Invalid index method")
dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump()
# NOTE(review): no commit in this branch — presumably the caller commits the session; confirm.
session.add(dataset)
else:
# has_published is true: an already-set chunk structure may not change.
if dataset.chunk_structure and dataset.chunk_structure != knowledge_configuration.chunk_structure:
raise ValueError("Chunk structure is not allowed to be updated.")
# Index action passed to the background task; None means no task is dispatched.
action = None
if dataset.indexing_technique != knowledge_configuration.indexing_technique:
# The indexing technique is being changed.
if knowledge_configuration.indexing_technique == "economy":
raise ValueError("Knowledge base indexing technique is not allowed to be updated to economy.")
elif knowledge_configuration.indexing_technique == "high_quality":
action = "add"
# get embedding model setting
try:
model_manager = ModelManager()
embedding_model = model_manager.get_model_instance(
tenant_id=current_user.current_tenant_id,
provider=knowledge_configuration.embedding_model_provider,
model_type=ModelType.TEXT_EMBEDDING,
model=knowledge_configuration.embedding_model,
)
dataset.embedding_model = embedding_model.model
dataset.embedding_model_provider = embedding_model.provider
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
embedding_model.provider, embedding_model.model
)
dataset.collection_binding_id = dataset_collection_binding.id
dataset.indexing_technique = knowledge_configuration.indexing_technique
except LLMBadRequestError:
raise ValueError(
"No Embedding Model available. Please configure a valid provider "
"in the Settings -> Model Provider."
)
except ProviderTokenNotInitError as ex:
raise ValueError(ex.description)
else:
# Indexing technique unchanged: check whether the embedding model or keyword number changed.
# Provider ids on both sides are normalized through ModelProviderID before comparison.
if dataset.indexing_technique == "high_quality":
skip_embedding_update = False
try:
# Normalize the provider currently stored on the dataset.
plugin_model_provider = dataset.embedding_model_provider
plugin_model_provider_str = None
if plugin_model_provider:
plugin_model_provider_str = str(ModelProviderID(plugin_model_provider))
# Normalize the provider requested by the configuration.
new_plugin_model_provider = knowledge_configuration.embedding_model_provider
new_plugin_model_provider_str = None
if new_plugin_model_provider:
new_plugin_model_provider_str = str(ModelProviderID(new_plugin_model_provider))
# Update when the normalized provider or the model name differs from the dataset's.
# NOTE(review): the condition does not check that the requested values are non-empty,
# so a request without provider/model also counts as "different" — confirm this is intended.
if (
plugin_model_provider_str != new_plugin_model_provider_str
or knowledge_configuration.embedding_model != dataset.embedding_model
):
action = "update"
model_manager = ModelManager()
embedding_model = None
try:
embedding_model = model_manager.get_model_instance(
tenant_id=current_user.current_tenant_id,
provider=knowledge_configuration.embedding_model_provider,
model_type=ModelType.TEXT_EMBEDDING,
model=knowledge_configuration.embedding_model,
)
except ProviderTokenNotInitError:
# The model could not be loaded: keep the existing embedding
# settings and skip the rest of the embedding update.
# NOTE(review): action stays "update" here, so the index task is still dispatched — confirm.
skip_embedding_update = True
if not skip_embedding_update:
if embedding_model:
dataset.embedding_model = embedding_model.model
dataset.embedding_model_provider = embedding_model.provider
dataset_collection_binding = (
DatasetCollectionBindingService.get_dataset_collection_binding(
embedding_model.provider, embedding_model.model
)
)
dataset.collection_binding_id = dataset_collection_binding.id
except LLMBadRequestError:
raise ValueError(
"No Embedding Model available. Please configure a valid provider "
"in the Settings -> Model Provider."
)
except ProviderTokenNotInitError as ex:
raise ValueError(ex.description)
elif dataset.indexing_technique == "economy":
# Only the keyword number can change for an economy index.
if dataset.keyword_number != knowledge_configuration.keyword_number:
dataset.keyword_number = knowledge_configuration.keyword_number
# The retrieval model is always taken from the configuration.
dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump()
session.add(dataset)
session.commit()
# Dispatch the index task only after the commit and only when an action was recorded.
if action:
deal_dataset_index_update_task.delay(dataset.id, action)
@staticmethod
def delete_dataset(dataset_id, user):
dataset = DatasetService.get_dataset(dataset_id)
@ -730,6 +1024,18 @@ class DatasetService:
.all()
)
@staticmethod
def update_dataset_api_status(dataset_id: str, status: bool):
    """
    Enable or disable API access for a dataset.

    Args:
        dataset_id: Id of the dataset to update.
        status: New value for ``dataset.enable_api``.

    Raises:
        NotFound: No dataset exists with ``dataset_id``.
        ValueError: There is no current user, or the user has no id.
    """
    dataset = DatasetService.get_dataset(dataset_id)
    if dataset is None:
        raise NotFound("Dataset not found.")
    # Validate the acting user before touching the entity, so a rejected call
    # leaves no uncommitted change on the session-tracked dataset.
    if not current_user or not current_user.id:
        raise ValueError("Current user or current user id not found")
    dataset.enable_api = status
    dataset.updated_by = current_user.id
    dataset.updated_at = naive_utc_now()
    db.session.commit()
@staticmethod
def get_dataset_auto_disable_logs(dataset_id: str):
assert isinstance(current_user, Account)
@ -974,7 +1280,7 @@ class DocumentService:
return
documents = db.session.scalars(select(Document).where(Document.id.in_(document_ids))).all()
file_ids = [
document.data_source_info_dict["upload_file_id"]
document.data_source_info_dict.get("upload_file_id", "")
for document in documents
if document.data_source_type == "upload_file" and document.data_source_info_dict
]
@ -1062,7 +1368,9 @@ class DocumentService:
redis_client.setex(retry_indexing_cache_key, 600, 1)
# trigger async task
document_ids = [document.id for document in documents]
retry_document_indexing_task.delay(dataset_id, document_ids)
if not current_user or not current_user.id:
raise ValueError("Current user or current user id not found")
retry_document_indexing_task.delay(dataset_id, document_ids, current_user.id)
@staticmethod
def sync_website_document(dataset_id: str, document: Document):
@ -1211,7 +1519,7 @@ class DocumentService:
)
return [], ""
db.session.add(dataset_process_rule)
db.session.commit()
db.session.flush()
lock_name = f"add_document_lock_dataset_id_{dataset.id}"
with redis_client.lock(lock_name, timeout=600):
position = DocumentService.get_documents_position(dataset.id)
@ -1301,23 +1609,10 @@ class DocumentService:
exist_document[data_source_info["notion_page_id"]] = document.id
for notion_info in notion_info_list:
workspace_id = notion_info.workspace_id
data_source_binding = (
db.session.query(DataSourceOauthBinding)
.where(
db.and_(
DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
DataSourceOauthBinding.provider == "notion",
DataSourceOauthBinding.disabled == False,
DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
)
)
.first()
)
if not data_source_binding:
raise ValueError("Data source binding not found.")
for page in notion_info.pages:
if page.page_id not in exist_page_ids:
data_source_info = {
"credential_id": notion_info.credential_id,
"notion_workspace_id": workspace_id,
"notion_page_id": page.page_id,
"notion_page_icon": page.page_icon.model_dump() if page.page_icon else None,
@ -1393,6 +1688,283 @@ class DocumentService:
return documents, batch
# @staticmethod
# def save_document_with_dataset_id(
# dataset: Dataset,
# knowledge_config: KnowledgeConfig,
# account: Account | Any,
# dataset_process_rule: Optional[DatasetProcessRule] = None,
# created_from: str = "web",
# ):
# # check document limit
# features = FeatureService.get_features(current_user.current_tenant_id)
# if features.billing.enabled:
# if not knowledge_config.original_document_id:
# count = 0
# if knowledge_config.data_source:
# if knowledge_config.data_source.info_list.data_source_type == "upload_file":
# upload_file_list = knowledge_config.data_source.info_list.file_info_list.file_ids
# # type: ignore
# count = len(upload_file_list)
# elif knowledge_config.data_source.info_list.data_source_type == "notion_import":
# notion_info_list = knowledge_config.data_source.info_list.notion_info_list
# for notion_info in notion_info_list: # type: ignore
# count = count + len(notion_info.pages)
# elif knowledge_config.data_source.info_list.data_source_type == "website_crawl":
# website_info = knowledge_config.data_source.info_list.website_info_list
# count = len(website_info.urls) # type: ignore
# batch_upload_limit = int(dify_config.BATCH_UPLOAD_LIMIT)
# if features.billing.subscription.plan == "sandbox" and count > 1:
# raise ValueError("Your current plan does not support batch upload, please upgrade your plan.")
# if count > batch_upload_limit:
# raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
# DocumentService.check_documents_upload_quota(count, features)
# # if dataset is empty, update dataset data_source_type
# if not dataset.data_source_type:
# dataset.data_source_type = knowledge_config.data_source.info_list.data_source_type # type: ignore
# if not dataset.indexing_technique:
# if knowledge_config.indexing_technique not in Dataset.INDEXING_TECHNIQUE_LIST:
# raise ValueError("Indexing technique is invalid")
# dataset.indexing_technique = knowledge_config.indexing_technique
# if knowledge_config.indexing_technique == "high_quality":
# model_manager = ModelManager()
# if knowledge_config.embedding_model and knowledge_config.embedding_model_provider:
# dataset_embedding_model = knowledge_config.embedding_model
# dataset_embedding_model_provider = knowledge_config.embedding_model_provider
# else:
# embedding_model = model_manager.get_default_model_instance(
# tenant_id=current_user.current_tenant_id, model_type=ModelType.TEXT_EMBEDDING
# )
# dataset_embedding_model = embedding_model.model
# dataset_embedding_model_provider = embedding_model.provider
# dataset.embedding_model = dataset_embedding_model
# dataset.embedding_model_provider = dataset_embedding_model_provider
# dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
# dataset_embedding_model_provider, dataset_embedding_model
# )
# dataset.collection_binding_id = dataset_collection_binding.id
# if not dataset.retrieval_model:
# default_retrieval_model = {
# "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
# "reranking_enable": False,
# "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
# "top_k": 2,
# "score_threshold_enabled": False,
# }
# dataset.retrieval_model = (
# knowledge_config.retrieval_model.model_dump()
# if knowledge_config.retrieval_model
# else default_retrieval_model
# ) # type: ignore
# documents = []
# if knowledge_config.original_document_id:
# document = DocumentService.update_document_with_dataset_id(dataset, knowledge_config, account)
# documents.append(document)
# batch = document.batch
# else:
# batch = time.strftime("%Y%m%d%H%M%S") + str(random.randint(100000, 999999))
# # save process rule
# if not dataset_process_rule:
# process_rule = knowledge_config.process_rule
# if process_rule:
# if process_rule.mode in ("custom", "hierarchical"):
# dataset_process_rule = DatasetProcessRule(
# dataset_id=dataset.id,
# mode=process_rule.mode,
# rules=process_rule.rules.model_dump_json() if process_rule.rules else None,
# created_by=account.id,
# )
# elif process_rule.mode == "automatic":
# dataset_process_rule = DatasetProcessRule(
# dataset_id=dataset.id,
# mode=process_rule.mode,
# rules=json.dumps(DatasetProcessRule.AUTOMATIC_RULES),
# created_by=account.id,
# )
# else:
# logging.warn(
# f"Invalid process rule mode: {process_rule.mode}, can not find dataset process rule"
# )
# return
# db.session.add(dataset_process_rule)
# db.session.commit()
# lock_name = "add_document_lock_dataset_id_{}".format(dataset.id)
# with redis_client.lock(lock_name, timeout=600):
# position = DocumentService.get_documents_position(dataset.id)
# document_ids = []
# duplicate_document_ids = []
# if knowledge_config.data_source.info_list.data_source_type == "upload_file": # type: ignore
# upload_file_list = knowledge_config.data_source.info_list.file_info_list.file_ids # type: ignore
# for file_id in upload_file_list:
# file = (
# db.session.query(UploadFile)
# .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
# .first()
# )
# # raise error if file not found
# if not file:
# raise FileNotExistsError()
# file_name = file.name
# data_source_info = {
# "upload_file_id": file_id,
# }
# # check duplicate
# if knowledge_config.duplicate:
# document = Document.query.filter_by(
# dataset_id=dataset.id,
# tenant_id=current_user.current_tenant_id,
# data_source_type="upload_file",
# enabled=True,
# name=file_name,
# ).first()
# if document:
# document.dataset_process_rule_id = dataset_process_rule.id # type: ignore
# document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
# document.created_from = created_from
# document.doc_form = knowledge_config.doc_form
# document.doc_language = knowledge_config.doc_language
# document.data_source_info = json.dumps(data_source_info)
# document.batch = batch
# document.indexing_status = "waiting"
# db.session.add(document)
# documents.append(document)
# duplicate_document_ids.append(document.id)
# continue
# document = DocumentService.build_document(
# dataset,
# dataset_process_rule.id, # type: ignore
# knowledge_config.data_source.info_list.data_source_type, # type: ignore
# knowledge_config.doc_form,
# knowledge_config.doc_language,
# data_source_info,
# created_from,
# position,
# account,
# file_name,
# batch,
# )
# db.session.add(document)
# db.session.flush()
# document_ids.append(document.id)
# documents.append(document)
# position += 1
# elif knowledge_config.data_source.info_list.data_source_type == "notion_import": # type: ignore
# notion_info_list = knowledge_config.data_source.info_list.notion_info_list # type: ignore
# if not notion_info_list:
# raise ValueError("No notion info list found.")
# exist_page_ids = []
# exist_document = {}
# documents = Document.query.filter_by(
# dataset_id=dataset.id,
# tenant_id=current_user.current_tenant_id,
# data_source_type="notion_import",
# enabled=True,
# ).all()
# if documents:
# for document in documents:
# data_source_info = json.loads(document.data_source_info)
# exist_page_ids.append(data_source_info["notion_page_id"])
# exist_document[data_source_info["notion_page_id"]] = document.id
# for notion_info in notion_info_list:
# workspace_id = notion_info.workspace_id
# data_source_binding = DataSourceOauthBinding.query.filter(
# db.and_(
# DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
# DataSourceOauthBinding.provider == "notion",
# DataSourceOauthBinding.disabled == False,
# DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
# )
# ).first()
# if not data_source_binding:
# raise ValueError("Data source binding not found.")
# for page in notion_info.pages:
# if page.page_id not in exist_page_ids:
# data_source_info = {
# "notion_workspace_id": workspace_id,
# "notion_page_id": page.page_id,
# "notion_page_icon": page.page_icon.model_dump() if page.page_icon else None,
# "type": page.type,
# }
# # Truncate page name to 255 characters to prevent DB field length errors
# truncated_page_name = page.page_name[:255] if page.page_name else "nopagename"
# document = DocumentService.build_document(
# dataset,
# dataset_process_rule.id, # type: ignore
# knowledge_config.data_source.info_list.data_source_type, # type: ignore
# knowledge_config.doc_form,
# knowledge_config.doc_language,
# data_source_info,
# created_from,
# position,
# account,
# truncated_page_name,
# batch,
# )
# db.session.add(document)
# db.session.flush()
# document_ids.append(document.id)
# documents.append(document)
# position += 1
# else:
# exist_document.pop(page.page_id)
# # delete not selected documents
# if len(exist_document) > 0:
# clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
# elif knowledge_config.data_source.info_list.data_source_type == "website_crawl": # type: ignore
# website_info = knowledge_config.data_source.info_list.website_info_list # type: ignore
# if not website_info:
# raise ValueError("No website info list found.")
# urls = website_info.urls
# for url in urls:
# data_source_info = {
# "url": url,
# "provider": website_info.provider,
# "job_id": website_info.job_id,
# "only_main_content": website_info.only_main_content,
# "mode": "crawl",
# }
# if len(url) > 255:
# document_name = url[:200] + "..."
# else:
# document_name = url
# document = DocumentService.build_document(
# dataset,
# dataset_process_rule.id, # type: ignore
# knowledge_config.data_source.info_list.data_source_type, # type: ignore
# knowledge_config.doc_form,
# knowledge_config.doc_language,
# data_source_info,
# created_from,
# position,
# account,
# document_name,
# batch,
# )
# db.session.add(document)
# db.session.flush()
# document_ids.append(document.id)
# documents.append(document)
# position += 1
# db.session.commit()
# # trigger async task
# if document_ids:
# document_indexing_task.delay(dataset.id, document_ids)
# if duplicate_document_ids:
# duplicate_document_indexing_task.delay(dataset.id, duplicate_document_ids)
# return documents, batch
@staticmethod
def check_documents_upload_quota(count: int, features: FeatureModel):
can_upload_size = features.documents_upload_quota.limit - features.documents_upload_quota.size
@ -1404,7 +1976,7 @@ class DocumentService:
@staticmethod
def build_document(
dataset: Dataset,
process_rule_id: str,
process_rule_id: str | None,
data_source_type: str,
document_form: str,
document_language: str,
@ -1540,6 +2112,7 @@ class DocumentService:
raise ValueError("Data source binding not found.")
for page in notion_info.pages:
data_source_info = {
"credential_id": notion_info.credential_id,
"notion_workspace_id": workspace_id,
"notion_page_id": page.page_id,
"notion_page_icon": page.page_icon.model_dump() if page.page_icon else None, # type: ignore
@ -2352,6 +2925,8 @@ class SegmentService:
segment.error = str(e)
db.session.commit()
new_segment = db.session.query(DocumentSegment).where(DocumentSegment.id == segment.id).first()
if not new_segment:
raise ValueError("new_segment is not found")
return new_segment
@classmethod
@ -2430,9 +3005,11 @@ class SegmentService:
if index_node_ids or child_node_ids:
delete_segment_from_index_task.delay(index_node_ids, dataset.id, document.id, child_node_ids)
document.word_count = (
document.word_count - total_words if document.word_count and document.word_count > total_words else 0
)
if document.word_count is None:
document.word_count = 0
else:
document.word_count = max(0, document.word_count - total_words)
db.session.add(document)
# Delete database records

View file

@ -0,0 +1,975 @@
import logging
import time
from collections.abc import Mapping
from typing import Any
from flask_login import current_user
from sqlalchemy.orm import Session
from configs import dify_config
from constants import HIDDEN_VALUE, UNKNOWN_VALUE
from core.helper import encrypter
from core.helper.name_generator import generate_incremental_name
from core.helper.provider_cache import NoOpProviderCredentialCache
from core.model_runtime.entities.provider_entities import FormType
from core.plugin.impl.datasource import PluginDatasourceManager
from core.plugin.impl.oauth import OAuthHandler
from core.tools.entities.tool_entities import CredentialType
from core.tools.utils.encryption import ProviderConfigCache, ProviderConfigEncrypter, create_provider_encrypter
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.oauth import DatasourceOauthParamConfig, DatasourceOauthTenantParamConfig, DatasourceProvider
from models.provider_ids import DatasourceProviderID
from services.plugin.plugin_service import PluginService
logger = logging.getLogger(__name__)
class DatasourceProviderService:
"""
Datasource Provider Service
"""
def __init__(self) -> None:
    """Create the service with its own ``PluginDatasourceManager``."""
    manager = PluginDatasourceManager()
    self.provider_manager = manager
def remove_oauth_custom_client_params(self, tenant_id: str, datasource_provider_id: DatasourceProviderID):
    """
    Delete the tenant's custom OAuth client parameters for a datasource provider.

    Removes every ``DatasourceOauthTenantParamConfig`` row matching the tenant,
    the provider name and the plugin id, then commits.
    """
    with Session(db.engine) as session:
        criteria = {
            "tenant_id": tenant_id,
            "provider": datasource_provider_id.provider_name,
            "plugin_id": datasource_provider_id.plugin_id,
        }
        matching_configs = session.query(DatasourceOauthTenantParamConfig).filter_by(**criteria)
        matching_configs.delete()
        session.commit()
def decrypt_datasource_provider_credentials(
    self,
    tenant_id: str,
    datasource_provider: DatasourceProvider,
    plugin_id: str,
    provider: str,
) -> dict[str, Any]:
    """
    Return a copy of the stored credentials with every secret field decrypted.

    :param tenant_id: workspace id
    :param datasource_provider: stored credential record
    :param plugin_id: plugin id of the datasource
    :param provider: provider name of the datasource
    :return: new dict; the record's own credential mapping is left untouched
    """
    stored_credentials = datasource_provider.encrypted_credentials
    # Only fields the provider schema declares as secret inputs are encrypted at rest.
    secret_keys = self.extract_secret_variables(
        tenant_id=tenant_id,
        provider_id=f"{plugin_id}/{provider}",
        credential_type=CredentialType.of(datasource_provider.auth_type),
    )
    return {
        key: encrypter.decrypt_token(tenant_id, value) if key in secret_keys else value
        for key, value in stored_credentials.items()
    }
def encrypt_datasource_provider_credentials(
    self,
    tenant_id: str,
    provider: str,
    plugin_id: str,
    raw_credentials: Mapping[str, Any],
    datasource_provider: DatasourceProvider,
) -> dict[str, Any]:
    """
    Return a copy of ``raw_credentials`` with every secret field encrypted.

    :param tenant_id: workspace id
    :param provider: provider name of the datasource
    :param plugin_id: plugin id of the datasource
    :param raw_credentials: plain credentials to protect
    :param datasource_provider: stored credential record; only its auth type is read
    :return: new dict suitable for storing on the record
    """
    secret_keys = self.extract_secret_variables(
        tenant_id=tenant_id,
        provider_id=f"{plugin_id}/{provider}",
        credential_type=CredentialType.of(datasource_provider.auth_type),
    )
    return {
        key: encrypter.encrypt_token(tenant_id, value) if key in secret_keys else value
        for key, value in raw_credentials.items()
    }
def get_datasource_credentials(
    self,
    tenant_id: str,
    provider: str,
    plugin_id: str,
    credential_id: str | None = None,
) -> dict[str, Any]:
    """
    Get decrypted credentials, refreshing them first when they are about to expire.

    :param tenant_id: workspace id
    :param provider: provider name of the datasource
    :param plugin_id: plugin id of the datasource
    :param credential_id: specific credential to load; when omitted, the default
        (then oldest) credential of the provider is used
    :return: decrypted credentials, or an empty dict when no credential exists
    """
    with Session(db.engine) as session:
        provider_query = session.query(DatasourceProvider)
        if credential_id:
            record = provider_query.filter_by(tenant_id=tenant_id, id=credential_id).first()
        else:
            record = (
                provider_query.filter_by(tenant_id=tenant_id, provider=provider, plugin_id=plugin_id)
                .order_by(DatasourceProvider.is_default.desc(), DatasourceProvider.created_at.asc())
                .first()
            )
        if not record:
            return {}

        # -1 is treated as "never expires"; otherwise refresh once within 60s of expiry.
        about_to_expire = record.expires_at != -1 and (record.expires_at - 60) < int(time.time())
        if about_to_expire:
            current_credentials = self.decrypt_datasource_provider_credentials(
                tenant_id=tenant_id,
                datasource_provider=record,
                plugin_id=plugin_id,
                provider=provider,
            )
            datasource_provider_id = DatasourceProviderID(f"{plugin_id}/{provider}")
            callback_uri = (
                f"{dify_config.CONSOLE_API_URL}/console/api/oauth/plugin/"
                f"{datasource_provider_id}/datasource/callback"
            )
            oauth_client = self.get_oauth_client(tenant_id, datasource_provider_id)
            refreshed = OAuthHandler().refresh_credentials(
                tenant_id=tenant_id,
                user_id=current_user.id,
                plugin_id=datasource_provider_id.plugin_id,
                provider=datasource_provider_id.provider_name,
                redirect_uri=callback_uri,
                system_credentials=oauth_client or {},
                credentials=current_credentials,
            )
            record.encrypted_credentials = self.encrypt_datasource_provider_credentials(
                tenant_id=tenant_id,
                raw_credentials=refreshed.credentials,
                provider=provider,
                plugin_id=plugin_id,
                datasource_provider=record,
            )
            record.expires_at = refreshed.expires_at
            session.commit()

        return self.decrypt_datasource_provider_credentials(
            tenant_id=tenant_id,
            datasource_provider=record,
            plugin_id=plugin_id,
            provider=provider,
        )
def get_all_datasource_credentials_by_provider(
    self,
    tenant_id: str,
    provider: str,
    plugin_id: str,
) -> list[dict[str, Any]]:
    """
    Get the decrypted credentials of every credential stored for a provider.

    Credentials that are about to expire are refreshed through the OAuth handler
    first, using the same rule as ``get_datasource_credentials``: an ``expires_at``
    of -1 is never refreshed, anything else is refreshed within 60 seconds of expiry.

    :param tenant_id: workspace id
    :param provider: provider name of the datasource
    :param plugin_id: plugin id of the datasource
    :return: decrypted credentials, default credential first, then by creation time
    """
    with Session(db.engine) as session:
        datasource_providers = (
            session.query(DatasourceProvider)
            .filter_by(tenant_id=tenant_id, provider=provider, plugin_id=plugin_id)
            .order_by(DatasourceProvider.is_default.desc(), DatasourceProvider.created_at.asc())
            .all()
        )
        if not datasource_providers:
            return []

        # Identical for every credential of this provider, so build them once.
        datasource_provider_id = DatasourceProviderID(f"{plugin_id}/{provider}")
        redirect_uri = (
            f"{dify_config.CONSOLE_API_URL}/console/api/oauth/plugin/"
            f"{datasource_provider_id}/datasource/callback"
        )
        now = int(time.time())

        refreshed_any = False
        real_credentials_list = []
        for datasource_provider in datasource_providers:
            # Refreshing unconditionally would push non-expiring credentials
            # (expires_at == -1) through the OAuth refresh flow as well.
            if datasource_provider.expires_at != -1 and (datasource_provider.expires_at - 60) < now:
                decrypted_credentials = self.decrypt_datasource_provider_credentials(
                    tenant_id=tenant_id,
                    datasource_provider=datasource_provider,
                    plugin_id=plugin_id,
                    provider=provider,
                )
                system_credentials = self.get_oauth_client(tenant_id, datasource_provider_id)
                refreshed_credentials = OAuthHandler().refresh_credentials(
                    tenant_id=tenant_id,
                    user_id=current_user.id,
                    plugin_id=datasource_provider_id.plugin_id,
                    provider=datasource_provider_id.provider_name,
                    redirect_uri=redirect_uri,
                    system_credentials=system_credentials or {},
                    credentials=decrypted_credentials,
                )
                datasource_provider.encrypted_credentials = self.encrypt_datasource_provider_credentials(
                    tenant_id=tenant_id,
                    raw_credentials=refreshed_credentials.credentials,
                    provider=provider,
                    plugin_id=plugin_id,
                    datasource_provider=datasource_provider,
                )
                datasource_provider.expires_at = refreshed_credentials.expires_at
                refreshed_any = True

            real_credentials_list.append(
                self.decrypt_datasource_provider_credentials(
                    tenant_id=tenant_id,
                    datasource_provider=datasource_provider,
                    plugin_id=plugin_id,
                    provider=provider,
                )
            )

        if refreshed_any:
            session.commit()
        return real_credentials_list
def update_datasource_provider_name(
    self, tenant_id: str, datasource_provider_id: DatasourceProviderID, name: str, credential_id: str
):
    """
    Rename a stored datasource credential.

    :param tenant_id: workspace id
    :param datasource_provider_id: plugin id and provider name of the datasource
    :param name: new display name
    :param credential_id: id of the credential to rename
    :raises ValueError: if the credential does not exist or the name is already taken
    """
    provider_scope = {
        "tenant_id": tenant_id,
        "provider": datasource_provider_id.provider_name,
        "plugin_id": datasource_provider_id.plugin_id,
    }
    with Session(db.engine) as session:
        credential = session.query(DatasourceProvider).filter_by(id=credential_id, **provider_scope).first()
        if credential is None:
            raise ValueError("provider not found")

        # Nothing to do when the name is unchanged.
        if credential.name == name:
            return

        # Names must be unique among the credentials of the same provider.
        same_name_count = session.query(DatasourceProvider).filter_by(name=name, **provider_scope).count()
        if same_name_count > 0:
            raise ValueError("Authorization name is already exists")

        credential.name = name
        session.commit()
    return
def set_default_datasource_provider(
    self, tenant_id: str, datasource_provider_id: DatasourceProviderID, credential_id: str
):
    """
    Mark one credential as the default for its datasource provider.

    :param tenant_id: workspace id
    :param datasource_provider_id: plugin id and provider name of the datasource
    :param credential_id: id of the credential that becomes the default
    :return: ``{"result": "success"}``
    :raises ValueError: if the credential does not exist
    """
    with Session(db.engine) as session:
        new_default = (
            session.query(DatasourceProvider)
            .filter_by(
                tenant_id=tenant_id,
                id=credential_id,
                provider=datasource_provider_id.provider_name,
                plugin_id=datasource_provider_id.plugin_id,
            )
            .first()
        )
        if new_default is None:
            raise ValueError("provider not found")

        # Unset whichever credential(s) of the same provider currently hold the flag.
        current_defaults = session.query(DatasourceProvider).filter_by(
            tenant_id=tenant_id,
            provider=new_default.provider,
            plugin_id=new_default.plugin_id,
            is_default=True,
        )
        current_defaults.update({"is_default": False})

        new_default.is_default = True
        session.commit()
        return {"result": "success"}
def setup_oauth_custom_client_params(
    self,
    tenant_id: str,
    datasource_provider_id: DatasourceProviderID,
    client_params: dict | None,
    enabled: bool | None,
):
    """
    Create or update the workspace-level custom OAuth client params.

    :param tenant_id: workspace id
    :param datasource_provider_id: plugin id and provider name of the datasource
    :param client_params: new client params; values equal to ``HIDDEN_VALUE`` keep the
        stored value. ``None`` leaves the stored params untouched
    :param enabled: new enabled flag; ``None`` leaves the flag untouched
    """
    if client_params is None and enabled is None:
        return

    with Session(db.engine) as session:
        config = (
            session.query(DatasourceOauthTenantParamConfig)
            .filter_by(
                tenant_id=tenant_id,
                provider=datasource_provider_id.provider_name,
                plugin_id=datasource_provider_id.plugin_id,
            )
            .first()
        )
        if not config:
            # First-time setup: start from an empty, disabled config row.
            config = DatasourceOauthTenantParamConfig(
                tenant_id=tenant_id,
                provider=datasource_provider_id.provider_name,
                plugin_id=datasource_provider_id.plugin_id,
                client_params={},
                enabled=False,
            )
            session.add(config)

        if client_params is not None:
            oauth_encrypter, _ = self.get_oauth_encrypter(tenant_id, datasource_provider_id)
            stored_params = oauth_encrypter.decrypt(config.client_params) if config else {}
            merged_params: dict = {}
            for key, value in client_params.items():
                # A hidden placeholder means "keep what is already stored".
                if value != HIDDEN_VALUE:
                    merged_params[key] = value
                else:
                    merged_params[key] = stored_params.get(key, UNKNOWN_VALUE)
            config.client_params = oauth_encrypter.encrypt(merged_params)

        if enabled is not None:
            config.enabled = enabled
        session.commit()
def is_system_oauth_params_exist(self, datasource_provider_id: DatasourceProviderID) -> bool:
    """
    Check whether system-level OAuth client params exist for a datasource provider.

    :param datasource_provider_id: plugin id and provider name of the datasource
    :return: True if a system OAuth param config row exists
    """
    # `Session(...).no_autoflush` only toggles autoflush and never closes the session;
    # using the Session itself as the context manager guarantees it is closed.
    with Session(db.engine, autoflush=False) as session:
        return (
            session.query(DatasourceOauthParamConfig)
            .filter_by(provider=datasource_provider_id.provider_name, plugin_id=datasource_provider_id.plugin_id)
            .first()
            is not None
        )
def is_tenant_oauth_params_enabled(self, tenant_id: str, datasource_provider_id: DatasourceProviderID) -> bool:
    """
    Check whether the workspace has an enabled custom OAuth client config.

    :param tenant_id: workspace id
    :param datasource_provider_id: plugin id and provider name of the datasource
    :return: True if at least one enabled config row exists
    """
    enabled_configs = db.session.query(DatasourceOauthTenantParamConfig).filter_by(
        tenant_id=tenant_id,
        provider=datasource_provider_id.provider_name,
        plugin_id=datasource_provider_id.plugin_id,
        enabled=True,
    )
    enabled_count = enabled_configs.count()
    return enabled_count > 0
def get_tenant_oauth_client(
    self, tenant_id: str, datasource_provider_id: DatasourceProviderID, mask: bool = False
) -> dict[str, Any] | None:
    """
    Get the workspace-level custom OAuth client params.

    :param tenant_id: workspace id
    :param datasource_provider_id: plugin id and provider name of the datasource
    :param mask: when True, the decrypted params are masked before being returned
    :return: decrypted (optionally masked) client params, or None when nothing is stored
    """
    config = (
        db.session.query(DatasourceOauthTenantParamConfig)
        .filter_by(
            tenant_id=tenant_id,
            provider=datasource_provider_id.provider_name,
            plugin_id=datasource_provider_id.plugin_id,
        )
        .first()
    )
    if not config:
        return None

    oauth_encrypter, _ = self.get_oauth_encrypter(tenant_id, datasource_provider_id)
    client_params = oauth_encrypter.decrypt(config.client_params)
    return oauth_encrypter.mask_tool_credentials(client_params) if mask else client_params
def get_oauth_encrypter(
    self, tenant_id: str, datasource_provider_id: DatasourceProviderID
) -> tuple[ProviderConfigEncrypter, ProviderConfigCache]:
    """
    Build the encrypter for a provider's OAuth client params.

    :param tenant_id: workspace id
    :param datasource_provider_id: plugin id and provider name of the datasource
    :return: encrypter and cache as returned by ``create_provider_encrypter``
    :raises ValueError: if the provider declares no OAuth schema
    """
    provider_entity = self.provider_manager.fetch_datasource_provider(
        tenant_id=tenant_id, provider_id=str(datasource_provider_id)
    )
    oauth_schema = provider_entity.declaration.oauth_schema
    if not oauth_schema:
        raise ValueError("Datasource provider oauth schema not found")

    client_configs = [schema_item.to_basic_provider_config() for schema_item in oauth_schema.client_schema]
    # Explicitly uses the no-op credential cache.
    return create_provider_encrypter(
        tenant_id=tenant_id,
        config=client_configs,
        cache=NoOpProviderCredentialCache(),
    )
def get_oauth_client(self, tenant_id: str, datasource_provider_id: DatasourceProviderID) -> dict[str, Any] | None:
    """
    Resolve the OAuth client params to use for a datasource provider.

    The workspace's enabled custom client params take precedence. Otherwise, if the
    plugin is verified, the system-level client params are used.

    :param tenant_id: workspace id
    :param datasource_provider_id: plugin id and provider name of the datasource
    :return: OAuth client params
    :raises ValueError: if neither tenant nor usable system client params are configured
    """
    provider = datasource_provider_id.provider_name
    plugin_id = datasource_provider_id.plugin_id
    # `Session(...).no_autoflush` only toggles autoflush and never closes the session;
    # using the Session itself as the context manager guarantees it is closed.
    with Session(db.engine, autoflush=False) as session:
        # get tenant oauth client params
        tenant_oauth_client_params = (
            session.query(DatasourceOauthTenantParamConfig)
            .filter_by(
                tenant_id=tenant_id,
                provider=provider,
                plugin_id=plugin_id,
                enabled=True,
            )
            .first()
        )
        if tenant_oauth_client_params:
            oauth_encrypter, _ = self.get_oauth_encrypter(tenant_id, datasource_provider_id)
            return oauth_encrypter.decrypt(tenant_oauth_client_params.client_params)

        provider_controller = self.provider_manager.fetch_datasource_provider(
            tenant_id=tenant_id, provider_id=str(datasource_provider_id)
        )
        is_verified = PluginService.is_plugin_verified(tenant_id, provider_controller.plugin_unique_identifier)
        if is_verified:
            # fallback to system oauth client params
            oauth_client_params = (
                session.query(DatasourceOauthParamConfig).filter_by(provider=provider, plugin_id=plugin_id).first()
            )
            if oauth_client_params:
                return oauth_client_params.system_credentials

        raise ValueError(f"Please configure oauth client params(system/tenant) for {plugin_id}/{provider}")
@staticmethod
def generate_next_datasource_provider_name(
    session: Session, tenant_id: str, provider_id: DatasourceProviderID, credential_type: CredentialType
) -> str:
    """
    Generate the next incremental credential name for a provider.

    :param session: session used to read the existing credential names
    :param tenant_id: workspace id
    :param provider_id: plugin id and provider name of the datasource
    :param credential_type: its display name is used as the base of the generated name
    :return: name produced by ``generate_incremental_name``
    """
    existing_credentials = (
        session.query(DatasourceProvider)
        .filter_by(
            tenant_id=tenant_id,
            provider=provider_id.provider_name,
            plugin_id=provider_id.plugin_id,
        )
        .all()
    )
    taken_names = [credential.name for credential in existing_credentials]
    base_name = f"{credential_type.get_name()}"
    return generate_incremental_name(taken_names, base_name)
def reauthorize_datasource_oauth_provider(
    self,
    name: str | None,
    tenant_id: str,
    provider_id: DatasourceProviderID,
    avatar_url: str | None,
    expire_at: int,
    credentials: dict,
    credential_id: str,
) -> None:
    """
    Replace the OAuth credentials of an existing datasource credential.

    :param name: requested display name (see the review note in the body)
    :param tenant_id: workspace id
    :param provider_id: plugin id and provider name of the datasource
    :param avatar_url: new avatar url; the stored one is kept when empty
    :param expire_at: new expiry timestamp
    :param credentials: plain OAuth credentials; not modified by this method
    :param credential_id: id of the credential to update
    :raises ValueError: if the credential does not exist
    """
    with Session(db.engine) as session:
        lock = f"datasource_provider_create_lock:{tenant_id}_{provider_id}_{CredentialType.OAUTH2.value}"
        with redis_client.lock(lock, timeout=20):
            target_provider = (
                session.query(DatasourceProvider).filter_by(id=credential_id, tenant_id=tenant_id).first()
            )
            if target_provider is None:
                raise ValueError("provider not found")

            # NOTE(review): the resolved name below is computed but never written back
            # to target_provider.name. Behavior is kept as-is; confirm whether
            # reauthorization is meant to rename the credential.
            db_provider_name = name
            if not db_provider_name:
                db_provider_name = target_provider.name
            else:
                name_conflict = (
                    session.query(DatasourceProvider)
                    .filter_by(
                        tenant_id=tenant_id,
                        name=db_provider_name,
                        provider=provider_id.provider_name,
                        plugin_id=provider_id.plugin_id,
                        auth_type=CredentialType.OAUTH2.value,
                    )
                    .count()
                )
                if name_conflict > 0:
                    db_provider_name = generate_incremental_name(
                        [
                            provider.name
                            for provider in session.query(DatasourceProvider).filter_by(
                                tenant_id=tenant_id,
                                provider=provider_id.provider_name,
                                plugin_id=provider_id.plugin_id,
                            )
                        ],
                        db_provider_name,
                    )

            secret_keys = self.extract_secret_variables(
                tenant_id=tenant_id, provider_id=f"{provider_id}", credential_type=CredentialType.OAUTH2
            )
            # Build a new dict so the caller's `credentials` is not overwritten with ciphertext.
            encrypted_credentials = {
                key: encrypter.encrypt_token(tenant_id, value) if key in secret_keys else value
                for key, value in credentials.items()
            }

            target_provider.expires_at = expire_at
            target_provider.encrypted_credentials = encrypted_credentials
            target_provider.avatar_url = avatar_url or target_provider.avatar_url
            session.commit()
def add_datasource_oauth_provider(
    self,
    name: str | None,
    tenant_id: str,
    provider_id: DatasourceProviderID,
    avatar_url: str | None,
    expire_at: int,
    credentials: dict,
) -> None:
    """
    Store a new OAuth credential for a datasource provider.

    :param name: display name; an incremental default is generated when empty, and an
        incremental variant is used when another OAuth credential already has this name
    :param tenant_id: workspace id
    :param provider_id: plugin id and provider name of the datasource
    :param avatar_url: avatar url; ``"default"`` is stored when empty
    :param expire_at: expiry timestamp of the credentials
    :param credentials: plain OAuth credentials; not modified by this method
    """
    credential_type = CredentialType.OAUTH2
    with Session(db.engine) as session:
        lock = f"datasource_provider_create_lock:{tenant_id}_{provider_id}_{credential_type.value}"
        with redis_client.lock(lock, timeout=60):
            db_provider_name = name
            if not db_provider_name:
                db_provider_name = self.generate_next_datasource_provider_name(
                    session=session,
                    tenant_id=tenant_id,
                    provider_id=provider_id,
                    credential_type=credential_type,
                )
            else:
                name_conflict = (
                    session.query(DatasourceProvider)
                    .filter_by(
                        tenant_id=tenant_id,
                        name=db_provider_name,
                        provider=provider_id.provider_name,
                        plugin_id=provider_id.plugin_id,
                        auth_type=credential_type.value,
                    )
                    .count()
                )
                if name_conflict > 0:
                    db_provider_name = generate_incremental_name(
                        [
                            provider.name
                            for provider in session.query(DatasourceProvider).filter_by(
                                tenant_id=tenant_id,
                                provider=provider_id.provider_name,
                                plugin_id=provider_id.plugin_id,
                            )
                        ],
                        db_provider_name,
                    )

            secret_keys = self.extract_secret_variables(
                tenant_id=tenant_id, provider_id=f"{provider_id}", credential_type=credential_type
            )
            # Build a new dict so the caller's `credentials` is not overwritten with ciphertext.
            encrypted_credentials = {
                key: encrypter.encrypt_token(tenant_id, value) if key in secret_keys else value
                for key, value in credentials.items()
            }

            datasource_provider = DatasourceProvider(
                tenant_id=tenant_id,
                name=db_provider_name,
                provider=provider_id.provider_name,
                plugin_id=provider_id.plugin_id,
                auth_type=credential_type.value,
                encrypted_credentials=encrypted_credentials,
                avatar_url=avatar_url or "default",
                expires_at=expire_at,
            )
            session.add(datasource_provider)
            session.commit()
def add_datasource_api_key_provider(
    self,
    name: str | None,
    tenant_id: str,
    provider_id: DatasourceProviderID,
    credentials: dict,
) -> None:
    """
    Validate API-key credentials and store them as a new datasource credential.

    :param name: display name; an incremental default is generated when empty
    :param tenant_id: workspace id
    :param provider_id: plugin id and provider name of the datasource
    :param credentials: plain credentials; not modified by this method
    :raises ValueError: if the name is already used for this provider, or if the
        credentials fail validation (the original error is chained as the cause)
    """
    provider_name = provider_id.provider_name
    plugin_id = provider_id.plugin_id

    with Session(db.engine) as session:
        lock = f"datasource_provider_create_lock:{tenant_id}_{provider_id}_{CredentialType.API_KEY}"
        with redis_client.lock(lock, timeout=20):
            db_provider_name = name or self.generate_next_datasource_provider_name(
                session=session,
                tenant_id=tenant_id,
                provider_id=provider_id,
                credential_type=CredentialType.API_KEY,
            )

            # check name is exist
            if (
                session.query(DatasourceProvider)
                .filter_by(tenant_id=tenant_id, plugin_id=plugin_id, provider=provider_name, name=db_provider_name)
                .count()
                > 0
            ):
                raise ValueError("Authorization name is already exists")

            try:
                self.provider_manager.validate_provider_credentials(
                    tenant_id=tenant_id,
                    user_id=current_user.id,
                    provider=provider_name,
                    plugin_id=plugin_id,
                    credentials=credentials,
                )
            except Exception as e:
                # Chain the cause so the underlying failure stays visible in tracebacks.
                raise ValueError(f"Failed to validate credentials: {str(e)}") from e

            secret_keys = self.extract_secret_variables(
                tenant_id=tenant_id, provider_id=f"{provider_id}", credential_type=CredentialType.API_KEY
            )
            # Build a new dict so the caller's `credentials` is not overwritten with ciphertext.
            encrypted_credentials = {
                key: encrypter.encrypt_token(tenant_id, value) if key in secret_keys else value
                for key, value in credentials.items()
            }

            datasource_provider = DatasourceProvider(
                tenant_id=tenant_id,
                name=db_provider_name,
                provider=provider_name,
                plugin_id=plugin_id,
                auth_type=CredentialType.API_KEY.value,
                encrypted_credentials=encrypted_credentials,
            )
            session.add(datasource_provider)
            session.commit()
def extract_secret_variables(self, tenant_id: str, provider_id: str, credential_type: CredentialType) -> list[str]:
    """
    List the names of the credential fields declared as secret inputs.

    :param tenant_id: workspace id
    :param provider_id: provider id in ``plugin_id/provider`` form
    :param credential_type: selects the API-key schema or the OAuth credentials schema
    :return: names of the secret-input fields, in schema order
    :raises ValueError: if the credential type is unsupported, or OAuth is requested
        for a provider without an OAuth schema
    """
    provider_entity = self.provider_manager.fetch_datasource_provider(
        tenant_id=tenant_id, provider_id=provider_id
    )
    if credential_type == CredentialType.API_KEY:
        form_schemas = list(provider_entity.declaration.credentials_schema)
    elif credential_type == CredentialType.OAUTH2:
        oauth_schema = provider_entity.declaration.oauth_schema
        if not oauth_schema:
            raise ValueError("Datasource provider oauth schema not found")
        form_schemas = list(oauth_schema.credentials_schema)
    else:
        raise ValueError(f"Invalid credential type: {credential_type}")

    return [
        form_schema.name
        for form_schema in form_schemas
        if form_schema.type.value == FormType.SECRET_INPUT.value
    ]
def list_datasource_credentials(self, tenant_id: str, provider: str, plugin_id: str) -> list[dict]:
    """
    List the credentials of a datasource provider with secret fields obfuscated.

    :param tenant_id: workspace id
    :param provider: provider name of the datasource
    :param plugin_id: plugin id of the datasource
    :return: one dict per credential with keys ``credential``, ``type``, ``name``,
        ``avatar_url``, ``id`` and ``is_default``
    """
    # Get all provider configurations of the current workspace
    datasource_providers: list[DatasourceProvider] = (
        db.session.query(DatasourceProvider)
        .where(
            DatasourceProvider.tenant_id == tenant_id,
            DatasourceProvider.provider == provider,
            DatasourceProvider.plugin_id == plugin_id,
        )
        .all()
    )
    if not datasource_providers:
        return []

    # The effective default is the flagged credential, falling back to the oldest one.
    default_provider = (
        db.session.query(DatasourceProvider.id)
        .filter_by(tenant_id=tenant_id, provider=provider, plugin_id=plugin_id)
        .order_by(DatasourceProvider.is_default.desc(), DatasourceProvider.created_at.asc())
        .first()
    )
    default_provider_id = default_provider.id if default_provider else None

    # The secret-field list depends only on the auth type, so look it up once per
    # type instead of fetching the provider declaration for every credential.
    secret_keys_by_auth_type: dict[str, list[str]] = {}
    copy_credentials_list = []
    for datasource_provider in datasource_providers:
        auth_type = datasource_provider.auth_type
        if auth_type not in secret_keys_by_auth_type:
            secret_keys_by_auth_type[auth_type] = self.extract_secret_variables(
                tenant_id=tenant_id,
                provider_id=f"{plugin_id}/{provider}",
                credential_type=CredentialType.of(auth_type),
            )
        secret_keys = secret_keys_by_auth_type[auth_type]

        # Obfuscate provider credentials
        copy_credentials = {
            key: encrypter.obfuscated_token(value) if key in secret_keys else value
            for key, value in datasource_provider.encrypted_credentials.items()
        }
        copy_credentials_list.append(
            {
                "credential": copy_credentials,
                "type": auth_type,
                "name": datasource_provider.name,
                "avatar_url": datasource_provider.avatar_url,
                "id": datasource_provider.id,
                # Always a real bool, never None or an id string.
                "is_default": default_provider_id is not None
                and datasource_provider.id == default_provider_id,
            }
        )
    return copy_credentials_list
def _build_datasource_credentials_entry(self, tenant_id: str, datasource) -> dict:
    """Build the credentials/schema payload for one installed datasource provider."""
    datasource_provider_id = DatasourceProviderID(f"{datasource.plugin_id}/{datasource.provider}")
    credentials = self.list_datasource_credentials(
        tenant_id=tenant_id, provider=datasource.provider, plugin_id=datasource.plugin_id
    )
    redirect_uri = (
        f"{dify_config.CONSOLE_API_URL}/console/api/oauth/plugin/"
        f"{datasource_provider_id}/datasource/callback"
    )
    declaration = datasource.declaration
    identity = declaration.identity
    return {
        "provider": datasource.provider,
        "plugin_id": datasource.plugin_id,
        "plugin_unique_identifier": datasource.plugin_unique_identifier,
        "icon": identity.icon,
        "name": identity.name.split("/")[-1],
        "label": identity.label.model_dump(),
        "description": identity.description.model_dump(),
        "author": identity.author,
        "credentials_list": credentials,
        "credential_schema": [credential.model_dump() for credential in declaration.credentials_schema],
        # OAuth details are only reported for providers that declare an OAuth schema.
        "oauth_schema": {
            "client_schema": [
                client_schema.model_dump() for client_schema in declaration.oauth_schema.client_schema
            ],
            "credentials_schema": [
                credential_schema.model_dump()
                for credential_schema in declaration.oauth_schema.credentials_schema
            ],
            "oauth_custom_client_params": self.get_tenant_oauth_client(
                tenant_id, datasource_provider_id, mask=True
            ),
            "is_oauth_custom_client_enabled": self.is_tenant_oauth_params_enabled(
                tenant_id, datasource_provider_id
            ),
            "is_system_oauth_params_exists": self.is_system_oauth_params_exist(datasource_provider_id),
            "redirect_uri": redirect_uri,
        }
        if declaration.oauth_schema
        else None,
    }

def get_all_datasource_credentials(self, tenant_id: str) -> list[dict]:
    """
    Get credentials and schemas for every installed datasource provider.

    :param tenant_id: workspace id
    :return: one payload dict per installed datasource provider
    """
    # get all plugin providers
    manager = PluginDatasourceManager()
    datasources = manager.fetch_installed_datasource_providers(tenant_id)
    return [self._build_datasource_credentials_entry(tenant_id, datasource) for datasource in datasources]

def get_hard_code_datasource_credentials(self, tenant_id: str) -> list[dict]:
    """
    Get credentials and schemas for the hard-coded set of datasource plugins.

    Same payload as ``get_all_datasource_credentials``, restricted to the firecrawl,
    notion and jina datasource plugins listed below.

    :param tenant_id: workspace id
    :return: one payload dict per matching installed datasource provider
    """
    hard_coded_plugin_ids = (
        "langgenius/firecrawl_datasource",
        "langgenius/notion_datasource",
        "langgenius/jina_datasource",
    )
    # get all plugin providers
    manager = PluginDatasourceManager()
    datasources = manager.fetch_installed_datasource_providers(tenant_id)
    return [
        self._build_datasource_credentials_entry(tenant_id, datasource)
        for datasource in datasources
        if datasource.plugin_id in hard_coded_plugin_ids
    ]
def get_real_datasource_credentials(self, tenant_id: str, provider: str, plugin_id: str) -> list[dict]:
    """
    Get the credentials of a datasource provider with secret fields decrypted.

    :param tenant_id: workspace id
    :param provider: provider name of the datasource
    :param plugin_id: plugin id of the datasource
    :return: one dict per credential with keys ``credentials`` and ``type``
    """
    # Get all provider configurations of the current workspace
    datasource_providers: list[DatasourceProvider] = (
        db.session.query(DatasourceProvider)
        .where(
            DatasourceProvider.tenant_id == tenant_id,
            DatasourceProvider.provider == provider,
            DatasourceProvider.plugin_id == plugin_id,
        )
        .all()
    )
    if not datasource_providers:
        return []

    # The secret-field list depends only on the auth type, so look it up once per
    # type instead of fetching the provider declaration for every credential.
    secret_keys_by_auth_type: dict[str, list[str]] = {}
    copy_credentials_list = []
    for datasource_provider in datasource_providers:
        auth_type = datasource_provider.auth_type
        if auth_type not in secret_keys_by_auth_type:
            secret_keys_by_auth_type[auth_type] = self.extract_secret_variables(
                tenant_id=tenant_id,
                provider_id=f"{plugin_id}/{provider}",
                credential_type=CredentialType.of(auth_type),
            )
        secret_keys = secret_keys_by_auth_type[auth_type]

        # Decrypt the secret fields; the stored mapping itself is not modified.
        copy_credentials = {
            key: encrypter.decrypt_token(tenant_id, value) if key in secret_keys else value
            for key, value in datasource_provider.encrypted_credentials.items()
        }
        copy_credentials_list.append(
            {
                "credentials": copy_credentials,
                "type": auth_type,
            }
        )
    return copy_credentials_list
def update_datasource_credentials(
    self, tenant_id: str, auth_id: str, provider: str, plugin_id: str, credentials: dict | None, name: str | None
) -> None:
    """
    Update the name and/or the credentials of a stored datasource credential.

    :param tenant_id: workspace id
    :param auth_id: id of the credential to update
    :param provider: provider name of the datasource
    :param plugin_id: plugin id of the datasource
    :param credentials: new credentials; values equal to ``HIDDEN_VALUE`` keep the stored
        value. Skipped when empty
    :param name: new display name. Skipped when empty or unchanged
    :raises ValueError: if the credential does not exist, the name is already taken, or
        the credentials fail validation (the original error is chained as the cause)
    """
    with Session(db.engine) as session:
        datasource_provider = (
            session.query(DatasourceProvider)
            .filter_by(tenant_id=tenant_id, id=auth_id, provider=provider, plugin_id=plugin_id)
            .first()
        )
        if not datasource_provider:
            raise ValueError("Datasource provider not found")

        # update name
        if name and name != datasource_provider.name:
            if (
                session.query(DatasourceProvider)
                .filter_by(tenant_id=tenant_id, name=name, provider=provider, plugin_id=plugin_id)
                .count()
                > 0
            ):
                raise ValueError("Authorization name is already exists")
            datasource_provider.name = name

        # update credentials
        if credentials:
            secret_variables = self.extract_secret_variables(
                tenant_id=tenant_id,
                provider_id=f"{plugin_id}/{provider}",
                credential_type=CredentialType.of(datasource_provider.auth_type),
            )
            original_credentials = {
                key: value if key not in secret_variables else encrypter.decrypt_token(tenant_id, value)
                for key, value in datasource_provider.encrypted_credentials.items()
            }
            # A hidden placeholder means "keep what is already stored".
            new_credentials = {
                key: value if value != HIDDEN_VALUE else original_credentials.get(key, UNKNOWN_VALUE)
                for key, value in credentials.items()
            }
            try:
                self.provider_manager.validate_provider_credentials(
                    tenant_id=tenant_id,
                    user_id=current_user.id,
                    provider=provider,
                    plugin_id=plugin_id,
                    credentials=new_credentials,
                )
            except Exception as e:
                # Chain the cause so the underlying failure stays visible in tracebacks.
                raise ValueError(f"Failed to validate credentials: {str(e)}") from e

            datasource_provider.encrypted_credentials = {
                key: encrypter.encrypt_token(tenant_id, value) if key in secret_variables else value
                for key, value in new_credentials.items()
            }
        session.commit()
def remove_datasource_credentials(self, tenant_id: str, auth_id: str, provider: str, plugin_id: str) -> None:
    """
    Delete a stored datasource credential; does nothing when it does not exist.

    :param tenant_id: workspace id
    :param auth_id: id of the credential to delete
    :param provider: provider name
    :param plugin_id: plugin id
    :return:
    """
    lookup = db.session.query(DatasourceProvider).filter_by(
        tenant_id=tenant_id, id=auth_id, provider=provider, plugin_id=plugin_id
    )
    credential = lookup.first()
    if not credential:
        return
    db.session.delete(credential)
    db.session.commit()

View file

@ -49,7 +49,7 @@ class PluginManagerService:
if not ret.get("result", False):
raise CredentialPolicyViolationError("Credentials not available: Please use ENTERPRISE global credentials")
logger.debug(
logging.debug(
"Credential policy compliance checked for %s with credential %s, result: %s",
body.provider,
body.dify_credential_id,

View file

@ -23,6 +23,7 @@ class NotionPage(BaseModel):
class NotionInfo(BaseModel):
credential_id: str
workspace_id: str
pages: list[NotionPage]

View file

@ -0,0 +1,130 @@
from typing import Literal
from pydantic import BaseModel, field_validator
class IconInfo(BaseModel):
    """
    Icon Info.
    """

    icon: str
    icon_background: str | None = None
    icon_type: str | None = None
    icon_url: str | None = None
class PipelineTemplateInfoEntity(BaseModel):
    """
    Pipeline Template Info: name, description and icon.
    """

    name: str
    description: str
    icon_info: IconInfo
class RagPipelineDatasetCreateEntity(BaseModel):
    """
    Rag Pipeline Dataset Create payload.
    """

    name: str
    description: str
    icon_info: IconInfo
    permission: str
    # Optional; defaults to None when not supplied.
    partial_member_list: list[str] | None = None
    yaml_content: str | None = None
class RerankingModelConfig(BaseModel):
    """
    Reranking model selection: provider name and model name.
    """

    # Both default to an empty string; None is also accepted.
    reranking_provider_name: str | None = ""
    reranking_model_name: str | None = ""
class VectorSetting(BaseModel):
    """
    Vector part of a weighted score config: weight plus embedding model.
    """

    vector_weight: float
    embedding_provider_name: str
    embedding_model_name: str
class KeywordSetting(BaseModel):
    """
    Keyword part of a weighted score config.
    """

    keyword_weight: float
class WeightedScoreConfig(BaseModel):
    """
    Weighted score config combining a vector setting and a keyword setting.
    """

    # No defaults: both fields must be supplied, but either may be None.
    vector_setting: VectorSetting | None
    keyword_setting: KeywordSetting | None
class EmbeddingSetting(BaseModel):
    """
    Embedding model selection: provider name and model name.
    """

    embedding_provider_name: str
    embedding_model_name: str
class EconomySetting(BaseModel):
    """
    Economy index setting.
    """

    keyword_number: int
class RetrievalSetting(BaseModel):
    """
    Retrieval configuration: search method, result limits and reranking options.
    """

    search_method: Literal["semantic_search", "fulltext_search", "keyword_search", "hybrid_search"]
    top_k: int
    # Threshold value and the switch that enables it are separate fields.
    score_threshold: float | None = 0.5
    score_threshold_enabled: bool = False
    reranking_mode: str | None = "reranking_model"
    reranking_enable: bool | None = True
    reranking_model: RerankingModelConfig | None = None
    weights: WeightedScoreConfig | None = None
class IndexMethod(BaseModel):
    """
    Knowledge index setting: indexing technique with its embedding and economy settings.
    """

    indexing_technique: Literal["high_quality", "economy"]
    # Both settings are required regardless of the chosen technique.
    embedding_setting: EmbeddingSetting
    economy_setting: EconomySetting
class KnowledgeConfiguration(BaseModel):
    """
    Knowledge base configuration: chunk structure, indexing and retrieval settings.
    """

    chunk_structure: str
    indexing_technique: Literal["high_quality", "economy"]
    embedding_model_provider: str = ""
    embedding_model: str = ""
    keyword_number: int | None = 10
    retrieval_model: RetrievalSetting

    @field_validator("embedding_model_provider", mode="before")
    @classmethod
    def validate_embedding_model_provider(cls, v):
        # Runs before type validation: a None input becomes the empty string.
        return "" if v is None else v

    @field_validator("embedding_model", mode="before")
    @classmethod
    def validate_embedding_model(cls, v):
        # Runs before type validation: a None input becomes the empty string.
        return "" if v is None else v

View file

@ -88,6 +88,10 @@ class WebAppAuthModel(BaseModel):
allow_email_password_login: bool = False
class KnowledgePipeline(BaseModel):
    """
    Knowledge pipeline feature flags.
    """

    # Publishing is disabled unless explicitly enabled.
    publish_enabled: bool = False
class PluginInstallationScope(StrEnum):
NONE = "none"
OFFICIAL_ONLY = "official_only"
@ -126,6 +130,7 @@ class FeatureModel(BaseModel):
is_allow_transfer_workspace: bool = True
# pydantic configs
model_config = ConfigDict(protected_namespaces=())
knowledge_pipeline: KnowledgePipeline = KnowledgePipeline()
class KnowledgeRateLimitModel(BaseModel):
@ -271,6 +276,9 @@ class FeatureService:
if "knowledge_rate_limit" in billing_info:
features.knowledge_rate_limit = billing_info["knowledge_rate_limit"]["limit"]
if "knowledge_pipeline_publish_enabled" in billing_info:
features.knowledge_pipeline.publish_enabled = billing_info["knowledge_pipeline_publish_enabled"]
@classmethod
def _fulfill_params_from_enterprise(cls, features: SystemFeatureModel):
enterprise_info = EnterpriseService.get_info()

View file

@ -3,6 +3,8 @@ import os
import uuid
from typing import Literal, Union
from sqlalchemy import Engine
from sqlalchemy.orm import sessionmaker
from werkzeug.exceptions import NotFound
from configs import dify_config
@ -14,11 +16,9 @@ from constants import (
)
from core.file import helpers as file_helpers
from core.rag.extractor.extract_processor import ExtractProcessor
from extensions.ext_database import db
from extensions.ext_storage import storage
from libs.datetime_utils import naive_utc_now
from libs.helper import extract_tenant_id
from libs.login import current_user
from models.account import Account
from models.enums import CreatorUserRole
from models.model import EndUser, UploadFile
@ -29,8 +29,18 @@ PREVIEW_WORDS_LIMIT = 3000
class FileService:
@staticmethod
_session_maker: sessionmaker
def __init__(self, session_factory: sessionmaker | Engine | None = None):
    """
    Bind the service to a session factory.

    :param session_factory: a ``sessionmaker`` (stored as-is) or an ``Engine``
        (wrapped in a new ``sessionmaker``)
    :raises AssertionError: for any other value, including the default ``None``
    """
    if isinstance(session_factory, sessionmaker):
        self._session_maker = session_factory
        return
    if isinstance(session_factory, Engine):
        self._session_maker = sessionmaker(bind=session_factory)
        return
    raise AssertionError("must be a sessionmaker or an Engine.")
def upload_file(
self,
*,
filename: str,
content: bytes,
@ -85,14 +95,14 @@ class FileService:
hash=hashlib.sha3_256(content).hexdigest(),
source_url=source_url,
)
db.session.add(upload_file)
db.session.commit()
# The `UploadFile` ID is generated within its constructor, so flushing to retrieve the ID is unnecessary.
# We can directly generate the `source_url` here before committing.
if not upload_file.source_url:
upload_file.source_url = file_helpers.get_signed_file_url(upload_file_id=upload_file.id)
db.session.add(upload_file)
db.session.commit()
with self._session_maker(expire_on_commit=False) as session:
session.add(upload_file)
session.commit()
return upload_file
@ -109,45 +119,42 @@ class FileService:
return file_size <= file_size_limit
@staticmethod
def upload_text(text: str, text_name: str) -> UploadFile:
assert isinstance(current_user, Account)
assert current_user.current_tenant_id is not None
def upload_text(self, text: str, text_name: str, user_id: str, tenant_id: str) -> UploadFile:
if len(text_name) > 200:
text_name = text_name[:200]
# user uuid as file name
file_uuid = str(uuid.uuid4())
file_key = "upload_files/" + current_user.current_tenant_id + "/" + file_uuid + ".txt"
file_key = "upload_files/" + tenant_id + "/" + file_uuid + ".txt"
# save file to storage
storage.save(file_key, text.encode("utf-8"))
# save file to db
upload_file = UploadFile(
tenant_id=current_user.current_tenant_id,
tenant_id=tenant_id,
storage_type=dify_config.STORAGE_TYPE,
key=file_key,
name=text_name,
size=len(text),
extension="txt",
mime_type="text/plain",
created_by=current_user.id,
created_by=user_id,
created_by_role=CreatorUserRole.ACCOUNT,
created_at=naive_utc_now(),
used=True,
used_by=current_user.id,
used_by=user_id,
used_at=naive_utc_now(),
)
db.session.add(upload_file)
db.session.commit()
with self._session_maker(expire_on_commit=False) as session:
session.add(upload_file)
session.commit()
return upload_file
@staticmethod
def get_file_preview(file_id: str):
upload_file = db.session.query(UploadFile).where(UploadFile.id == file_id).first()
def get_file_preview(self, file_id: str):
with self._session_maker(expire_on_commit=False) as session:
upload_file = session.query(UploadFile).where(UploadFile.id == file_id).first()
if not upload_file:
raise NotFound("File not found")
@ -162,15 +169,14 @@ class FileService:
return text
@staticmethod
def get_image_preview(file_id: str, timestamp: str, nonce: str, sign: str):
def get_image_preview(self, file_id: str, timestamp: str, nonce: str, sign: str):
result = file_helpers.verify_image_signature(
upload_file_id=file_id, timestamp=timestamp, nonce=nonce, sign=sign
)
if not result:
raise NotFound("File not found or signature is invalid")
upload_file = db.session.query(UploadFile).where(UploadFile.id == file_id).first()
with self._session_maker(expire_on_commit=False) as session:
upload_file = session.query(UploadFile).where(UploadFile.id == file_id).first()
if not upload_file:
raise NotFound("File not found or signature is invalid")
@ -184,13 +190,13 @@ class FileService:
return generator, upload_file.mime_type
@staticmethod
def get_file_generator_by_file_id(file_id: str, timestamp: str, nonce: str, sign: str):
def get_file_generator_by_file_id(self, file_id: str, timestamp: str, nonce: str, sign: str):
result = file_helpers.verify_file_signature(upload_file_id=file_id, timestamp=timestamp, nonce=nonce, sign=sign)
if not result:
raise NotFound("File not found or signature is invalid")
upload_file = db.session.query(UploadFile).where(UploadFile.id == file_id).first()
with self._session_maker(expire_on_commit=False) as session:
upload_file = session.query(UploadFile).where(UploadFile.id == file_id).first()
if not upload_file:
raise NotFound("File not found or signature is invalid")
@ -199,9 +205,9 @@ class FileService:
return generator, upload_file
@staticmethod
def get_public_image_preview(file_id: str):
upload_file = db.session.query(UploadFile).where(UploadFile.id == file_id).first()
def get_public_image_preview(self, file_id: str):
with self._session_maker(expire_on_commit=False) as session:
upload_file = session.query(UploadFile).where(UploadFile.id == file_id).first()
if not upload_file:
raise NotFound("File not found or signature is invalid")
@ -214,3 +220,23 @@ class FileService:
generator = storage.load(upload_file.key)
return generator, upload_file.mime_type
def get_file_content(self, file_id: str) -> str:
    """
    Load the stored content of an uploaded file and decode it as UTF-8.

    :param file_id: id of the ``UploadFile`` row
    :return: decoded file content
    :raises NotFound: when no ``UploadFile`` with this id exists
    """
    with self._session_maker(expire_on_commit=False) as session:
        record: UploadFile | None = session.query(UploadFile).where(UploadFile.id == file_id).first()

    if not record:
        raise NotFound("File not found")

    raw = storage.load(record.key)
    return raw.decode("utf-8")
def delete_file(self, file_id: str):
    """
    Delete an uploaded file from storage and remove its database row.

    Does nothing when no ``UploadFile`` with this id exists.

    :param file_id: id of the ``UploadFile`` row
    """
    with self._session_maker(expire_on_commit=False) as session:
        record: UploadFile | None = session.query(UploadFile).where(UploadFile.id == file_id).first()
        if record:
            # The stored object is removed first, then the row.
            storage.delete(record.key)
            session.delete(record)
            session.commit()

View file

@ -241,6 +241,9 @@ class MessageService:
app_config = AdvancedChatAppConfigManager.get_app_config(app_model=app_model, workflow=workflow)
if not app_config.additional_features:
raise ValueError("Additional features not found")
if not app_config.additional_features.suggested_questions_after_answer:
raise SuggestedQuestionsAfterAnswerDisabledError()

View file

@ -4,8 +4,8 @@ import logging
import click
import sqlalchemy as sa
from core.plugin.entities.plugin import GenericProviderID, ModelProviderID, ToolProviderID
from models.engine import db
from extensions.ext_database import db
from models.provider_ids import GenericProviderID, ModelProviderID, ToolProviderID
logger = logging.getLogger(__name__)

View file

@ -1,7 +1,13 @@
import re
from configs import dify_config
from core.helper import marketplace
from core.plugin.entities.plugin import ModelProviderID, PluginDependency, PluginInstallationSource, ToolProviderID
from core.plugin.entities.plugin import PluginDependency, PluginInstallationSource
from core.plugin.impl.plugin import PluginInstaller
from models.provider_ids import ModelProviderID, ToolProviderID
# Compiled once at module level so it is not rebuilt on every use.
# Matches a ":" followed by a three-part numeric version (optionally with a
# "+"/"-" suffix) that is terminated by "@" or the end of the string; the
# version itself is captured in the named group "version".
_VERSION_REGEX = re.compile(r":(?P<version>[0-9]+(?:\.[0-9]+){2}(?:[+-][0-9A-Za-z.-]+)?)(?:@|$)")
class DependenciesAnalysisService:
@ -48,6 +54,13 @@ class DependenciesAnalysisService:
for dependency in dependencies:
unique_identifier = dependency.value.plugin_unique_identifier
if unique_identifier in missing_plugin_unique_identifiers:
# Extract version for Marketplace dependencies
if dependency.type == PluginDependency.Type.Marketplace:
version_match = _VERSION_REGEX.search(unique_identifier)
if version_match:
dependency.value.version = version_match.group("version")
# Create and append the dependency (same for all types)
leaked_dependencies.append(
PluginDependency(
type=dependency.type,

View file

@ -11,7 +11,13 @@ class OAuthProxyService(BasePluginClient):
__KEY_PREFIX__ = "oauth_proxy_context:"
@staticmethod
def create_proxy_context(user_id: str, tenant_id: str, plugin_id: str, provider: str):
def create_proxy_context(
user_id: str,
tenant_id: str,
plugin_id: str,
provider: str,
credential_id: str | None = None,
):
"""
Create a proxy context for an OAuth 2.0 authorization request.
@ -31,6 +37,8 @@ class OAuthProxyService(BasePluginClient):
"tenant_id": tenant_id,
"provider": provider,
}
if credential_id:
data["credential_id"] = credential_id
redis_client.setex(
f"{OAuthProxyService.__KEY_PREFIX__}{context_id}",
OAuthProxyService.__MAX_AGE__,

View file

@ -16,15 +16,17 @@ from sqlalchemy.orm import Session
from core.agent.entities import AgentToolEntity
from core.helper import marketplace
from core.plugin.entities.plugin import ModelProviderID, PluginInstallationSource, ToolProviderID
from core.plugin.entities.plugin import PluginInstallationSource
from core.plugin.entities.plugin_daemon import PluginInstallTaskStatus
from core.plugin.impl.plugin import PluginInstaller
from core.tools.entities.tool_entities import ToolProviderType
from extensions.ext_database import db
from models.account import Tenant
from models.engine import db
from models.model import App, AppMode, AppModelConfig
from models.provider_ids import ModelProviderID, ToolProviderID
from models.tools import BuiltinToolProvider
from models.workflow import Workflow
from services.plugin.plugin_service import PluginService
logger = logging.getLogger(__name__)
@ -421,6 +423,94 @@ class PluginMigration:
)
)
@classmethod
def install_rag_pipeline_plugins(cls, extracted_plugins: str, output_file: str, workers: int = 100) -> None:
    """
    Install rag pipeline plugins.

    The plugins are first installed for a throw-away ("fake") tenant id, then
    installed for every tenant on a thread pool, and finally the fake tenant's
    installations are removed. A JSON report is written to ``output_file``.

    :param extracted_plugins: passed to ``extract_unique_plugins`` to obtain the
        plugin id -> plugin unique identifier mapping
    :param output_file: path of the JSON report (tenant success/failure counts and
        the plugins that failed to install for the fake tenant)
    :param workers: maximum number of worker threads
    """
    manager = PluginInstaller()

    plugins = cls.extract_unique_plugins(extracted_plugins)
    plugin_identifiers = plugins["plugins"]
    plugin_install_failed = []

    # use a fake tenant id to install all the plugins
    fake_tenant_id = uuid4().hex
    logger.info("Installing %s plugin instances for fake tenant %s", len(plugin_identifiers), fake_tenant_id)

    response = cls.handle_plugin_instance_install(fake_tenant_id, plugin_identifiers)
    if response.get("failed"):
        plugin_install_failed.extend(response.get("failed", []))

    def install(tenant_id: str, plugin_ids: dict[str, str]) -> bool:
        """Install the not-yet-installed plugins for one tenant; return True on success."""
        logger.info("Installing %s plugins for tenant %s", len(plugin_ids), tenant_id)
        try:
            # fetch plugin already installed
            installed_plugin_ids = {plugin.plugin_id for plugin in manager.list_plugins(tenant_id)}
            missing_identifiers = [
                identifier for plugin_id, identifier in plugin_ids.items() if plugin_id not in installed_plugin_ids
            ]
            # at most 64 plugins one batch; batching after filtering never sends an empty batch
            for start in range(0, len(missing_identifiers), 64):
                PluginService.install_from_marketplace_pkg(tenant_id, missing_identifiers[start : start + 64])
        except Exception:
            logger.exception("Failed to install plugins for tenant %s", tenant_id)
            return False
        return True

    # Outcomes are collected from the futures: integers handed to a worker are
    # copies, so counters incremented inside the worker never reach this scope.
    futures = []
    with ThreadPoolExecutor(max_workers=workers) as thread_pool:
        page = 1
        while True:
            # paginate
            tenants = db.paginate(db.select(Tenant).order_by(Tenant.created_at.desc()), page=page, per_page=100)
            if not tenants.items:
                break

            for tenant in tenants:
                futures.append(thread_pool.submit(install, tenant.id, plugin_identifiers))

            page += 1
    # Leaving the ``with`` block waits for every submitted task to finish.

    outcomes = [future.result() for future in futures]
    total_success_tenant = sum(1 for succeeded in outcomes if succeeded)
    total_failed_tenant = len(outcomes) - total_success_tenant

    # uninstall all the plugins for fake tenant
    try:
        installation = manager.list_plugins(fake_tenant_id)
        while installation:
            for plugin in installation:
                manager.uninstall(fake_tenant_id, plugin.installation_id)

            installation = manager.list_plugins(fake_tenant_id)
    except Exception:
        logger.exception("Failed to get installation for tenant %s", fake_tenant_id)

    Path(output_file).write_text(
        json.dumps(
            {
                "total_success_tenant": total_success_tenant,
                "total_failed_tenant": total_failed_tenant,
                "plugin_install_failed": plugin_install_failed,
            }
        )
    )
@classmethod
def handle_plugin_instance_install(
cls, tenant_id: str, plugin_identifiers_map: Mapping[str, str]

View file

@ -10,7 +10,6 @@ from core.helper.download import download_with_size_limit
from core.helper.marketplace import download_plugin_pkg
from core.plugin.entities.bundle import PluginBundleDependency
from core.plugin.entities.plugin import (
GenericProviderID,
PluginDeclaration,
PluginEntity,
PluginInstallation,
@ -26,6 +25,7 @@ from core.plugin.impl.asset import PluginAssetManager
from core.plugin.impl.debugging import PluginDebuggingClient
from core.plugin.impl.plugin import PluginInstaller
from extensions.ext_redis import redis_client
from models.provider_ids import GenericProviderID
from services.errors.plugin import PluginInstallationForbiddenError
from services.feature_service import FeatureService, PluginInstallationScope

View file

@ -0,0 +1,22 @@
from collections.abc import Mapping
from typing import Any
from pydantic import BaseModel
class DatasourceNodeRunApiEntity(BaseModel):
    """API entity carrying the parameters of a datasource node run."""

    pipeline_id: str
    node_id: str
    inputs: dict[str, Any]
    datasource_type: str
    # Optional; omitted or None when no credential is supplied.
    credential_id: str | None = None
    is_published: bool
class PipelineRunApiEntity(BaseModel):
    """API entity carrying the parameters of a pipeline run."""

    inputs: Mapping[str, Any]
    datasource_type: str
    # One mapping per datasource item.
    datasource_info_list: list[Mapping[str, Any]]
    start_node_id: str
    is_published: bool
    response_mode: str

View file

@ -0,0 +1,115 @@
from collections.abc import Mapping
from typing import Any, Union
from configs import dify_config
from core.app.apps.pipeline.pipeline_generator import PipelineGenerator
from core.app.entities.app_invoke_entities import InvokeFrom
from extensions.ext_database import db
from models.dataset import Document, Pipeline
from models.model import Account, App, EndUser
from models.workflow import Workflow
from services.rag_pipeline.rag_pipeline import RagPipelineService
class PipelineGenerateService:
    """Entry points that run a pipeline's workflow through ``PipelineGenerator``."""

    @classmethod
    def generate(
        cls,
        pipeline: Pipeline,
        user: Union[Account, EndUser],
        args: Mapping[str, Any],
        invoke_from: InvokeFrom,
        streaming: bool = True,
    ):
        """
        Pipeline Content Generate

        When ``args`` carries a truthy ``original_document_id``, that document's
        indexing status is set to "waiting" before generation starts.

        :param pipeline: pipeline
        :param user: user
        :param args: args
        :param invoke_from: invoke from
        :param streaming: streaming
        :return: the generator output converted by ``PipelineGenerator.convert_to_event_stream``
        :raises ValueError: when the required draft/published workflow does not exist
        """
        workflow = cls._get_workflow(pipeline, invoke_from)
        if original_document_id := args.get("original_document_id"):
            # update document status to waiting
            cls.update_document_status(original_document_id)
        return PipelineGenerator.convert_to_event_stream(
            PipelineGenerator().generate(
                pipeline=pipeline,
                workflow=workflow,
                user=user,
                args=args,
                invoke_from=invoke_from,
                streaming=streaming,
                call_depth=0,
                workflow_thread_pool_id=None,
            ),
        )

    @staticmethod
    def _get_max_active_requests(app_model: App) -> int:
        """
        Return the app's ``max_active_requests``, falling back to
        ``dify_config.APP_MAX_ACTIVE_REQUESTS`` when it is ``None``.
        """
        max_active_requests = app_model.max_active_requests
        if max_active_requests is None:
            max_active_requests = int(dify_config.APP_MAX_ACTIVE_REQUESTS)
        return max_active_requests

    @classmethod
    def generate_single_iteration(
        cls, pipeline: Pipeline, user: Account, node_id: str, args: Any, streaming: bool = True
    ):
        """
        Run a single iteration node against the draft workflow.

        :param pipeline: pipeline
        :param user: account running the node
        :param node_id: id of the node to run
        :param args: args
        :param streaming: streaming
        :return: the generator output converted to an event stream
        """
        workflow = cls._get_workflow(pipeline, InvokeFrom.DEBUGGER)
        return PipelineGenerator.convert_to_event_stream(
            PipelineGenerator().single_iteration_generate(
                pipeline=pipeline, workflow=workflow, node_id=node_id, user=user, args=args, streaming=streaming
            )
        )

    @classmethod
    def generate_single_loop(cls, pipeline: Pipeline, user: Account, node_id: str, args: Any, streaming: bool = True):
        """
        Run a single loop node against the draft workflow.

        :param pipeline: pipeline
        :param user: account running the node
        :param node_id: id of the node to run
        :param args: args
        :param streaming: streaming
        :return: the generator output converted to an event stream
        """
        workflow = cls._get_workflow(pipeline, InvokeFrom.DEBUGGER)
        return PipelineGenerator.convert_to_event_stream(
            PipelineGenerator().single_loop_generate(
                pipeline=pipeline, workflow=workflow, node_id=node_id, user=user, args=args, streaming=streaming
            )
        )

    @classmethod
    def _get_workflow(cls, pipeline: Pipeline, invoke_from: InvokeFrom) -> Workflow:
        """
        Get workflow

        The draft workflow is used for ``InvokeFrom.DEBUGGER``; every other
        origin uses the published workflow.

        :param pipeline: pipeline
        :param invoke_from: invoke from
        :return: the resolved workflow
        :raises ValueError: when the draft or published workflow does not exist
        """
        rag_pipeline_service = RagPipelineService()
        if invoke_from == InvokeFrom.DEBUGGER:
            # fetch draft workflow by app_model
            workflow = rag_pipeline_service.get_draft_workflow(pipeline=pipeline)

            if not workflow:
                raise ValueError("Workflow not initialized")
        else:
            # fetch published workflow by app_model
            workflow = rag_pipeline_service.get_published_workflow(pipeline=pipeline)

            if not workflow:
                raise ValueError("Workflow not published")

        return workflow

    @classmethod
    def update_document_status(cls, document_id: str):
        """
        Update document status to waiting

        Does nothing when the document does not exist.

        :param document_id: document id
        """
        document = db.session.query(Document).where(Document.id == document_id).first()
        if document:
            document.indexing_status = "waiting"
            db.session.add(document)
            db.session.commit()

View file

@ -0,0 +1,63 @@
import json
from os import path
from pathlib import Path
from flask import current_app
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
class BuiltInPipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
    """
    Retrieval pipeline template from built-in, the location is constants/pipeline_templates.json
    """

    # Class-level cache of the parsed JSON file; filled on first access.
    builtin_data: dict | None = None

    def get_type(self) -> str:
        return PipelineTemplateType.BUILTIN

    def get_pipeline_templates(self, language: str) -> dict:
        return self.fetch_pipeline_templates_from_builtin(language)

    def get_pipeline_template_detail(self, template_id: str):
        return self.fetch_pipeline_template_detail_from_builtin(template_id)

    @classmethod
    def _get_builtin_data(cls) -> dict:
        """
        Get builtin data, reading and caching the JSON file when nothing truthy is cached yet.
        :return: parsed content of constants/pipeline_templates.json
        """
        cached = cls.builtin_data
        if cached:
            return cached

        template_file = Path(path.join(current_app.root_path, "constants", "pipeline_templates.json"))
        cls.builtin_data = json.loads(template_file.read_text(encoding="utf-8"))
        return cls.builtin_data or {}

    @classmethod
    def fetch_pipeline_templates_from_builtin(cls, language: str) -> dict:
        """
        Fetch pipeline templates from builtin.
        :param language: language
        :return: the entry stored under "pipeline_templates" for the language, or an empty dict
        """
        data: dict[str, dict[str, dict]] = cls._get_builtin_data()
        templates_section = data.get("pipeline_templates", {})
        return templates_section.get(language, {})

    @classmethod
    def fetch_pipeline_template_detail_from_builtin(cls, template_id: str) -> dict | None:
        """
        Fetch pipeline template detail from builtin.
        :param template_id: Template ID
        :return: the entry stored under "pipeline_templates" for the id, or None
        """
        # NOTE(review): this looks up template_id in the same "pipeline_templates"
        # mapping that the list lookup indexes by language - confirm against the JSON layout.
        data: dict[str, dict[str, dict]] = cls._get_builtin_data()
        templates_section = data.get("pipeline_templates", {})
        return templates_section.get(template_id)

View file

@ -0,0 +1,81 @@
import yaml
from flask_login import current_user
from extensions.ext_database import db
from models.dataset import PipelineCustomizedTemplate
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
class CustomizedPipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
    """
    Retrieval of customized pipeline templates from database
    """

    def get_pipeline_templates(self, language: str) -> dict:
        # Templates are scoped to the tenant of the logged-in user.
        result = self.fetch_pipeline_templates_from_customized(
            tenant_id=current_user.current_tenant_id, language=language
        )
        return result

    def get_pipeline_template_detail(self, template_id: str):
        result = self.fetch_pipeline_template_detail_from_db(template_id)
        return result

    def get_type(self) -> str:
        return PipelineTemplateType.CUSTOMIZED

    @classmethod
    def fetch_pipeline_templates_from_customized(cls, tenant_id: str, language: str) -> dict:
        """
        Fetch pipeline templates from db.
        :param tenant_id: tenant id
        :param language: language
        :return: ``{"pipeline_templates": [...]}`` ordered by position, then newest first
        """
        pipeline_customized_templates = (
            db.session.query(PipelineCustomizedTemplate)
            .where(PipelineCustomizedTemplate.tenant_id == tenant_id, PipelineCustomizedTemplate.language == language)
            .order_by(PipelineCustomizedTemplate.position.asc(), PipelineCustomizedTemplate.created_at.desc())
            .all()
        )
        recommended_pipelines_results = [
            {
                "id": pipeline_customized_template.id,
                "name": pipeline_customized_template.name,
                "description": pipeline_customized_template.description,
                "icon": pipeline_customized_template.icon,
                "position": pipeline_customized_template.position,
                "chunk_structure": pipeline_customized_template.chunk_structure,
            }
            for pipeline_customized_template in pipeline_customized_templates
        ]

        return {"pipeline_templates": recommended_pipelines_results}

    @classmethod
    def fetch_pipeline_template_detail_from_db(cls, template_id: str) -> dict | None:
        """
        Fetch pipeline template detail from db.
        :param template_id: Template ID
        :return: template detail, or None when the template does not exist
        """
        pipeline_template = (
            db.session.query(PipelineCustomizedTemplate).where(PipelineCustomizedTemplate.id == template_id).first()
        )
        if not pipeline_template:
            return None

        dsl_data = yaml.safe_load(pipeline_template.yaml_content)
        # ``safe_load`` returns None for empty content and may return a non-mapping;
        # in those cases the graph is reported as empty instead of raising AttributeError.
        workflow_data = dsl_data.get("workflow", {}) if isinstance(dsl_data, dict) else {}
        graph_data = workflow_data.get("graph", {}) if isinstance(workflow_data, dict) else {}

        return {
            "id": pipeline_template.id,
            "name": pipeline_template.name,
            "icon_info": pipeline_template.icon,
            "description": pipeline_template.description,
            "chunk_structure": pipeline_template.chunk_structure,
            "export_data": pipeline_template.yaml_content,
            "graph": graph_data,
            "created_by": pipeline_template.created_user_name,
        }

View file

@ -0,0 +1,78 @@
import yaml
from extensions.ext_database import db
from models.dataset import PipelineBuiltInTemplate
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
class DatabasePipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
    """
    Retrieval pipeline template from database
    """

    def get_pipeline_templates(self, language: str) -> dict:
        result = self.fetch_pipeline_templates_from_db(language)
        return result

    def get_pipeline_template_detail(self, template_id: str):
        result = self.fetch_pipeline_template_detail_from_db(template_id)
        return result

    def get_type(self) -> str:
        return PipelineTemplateType.DATABASE

    @classmethod
    def fetch_pipeline_templates_from_db(cls, language: str) -> dict:
        """
        Fetch pipeline templates from db.
        :param language: language
        :return: ``{"pipeline_templates": [...]}`` with one summary dict per template
        """
        pipeline_built_in_templates: list[PipelineBuiltInTemplate] = (
            db.session.query(PipelineBuiltInTemplate).where(PipelineBuiltInTemplate.language == language).all()
        )

        recommended_pipelines_results = [
            {
                "id": pipeline_built_in_template.id,
                "name": pipeline_built_in_template.name,
                "description": pipeline_built_in_template.description,
                "icon": pipeline_built_in_template.icon,
                "copyright": pipeline_built_in_template.copyright,
                "privacy_policy": pipeline_built_in_template.privacy_policy,
                "position": pipeline_built_in_template.position,
                "chunk_structure": pipeline_built_in_template.chunk_structure,
            }
            for pipeline_built_in_template in pipeline_built_in_templates
        ]

        return {"pipeline_templates": recommended_pipelines_results}

    @classmethod
    def fetch_pipeline_template_detail_from_db(cls, template_id: str) -> dict | None:
        """
        Fetch pipeline template detail from db.
        :param template_id: Template ID
        :return: template detail, or None when the template does not exist
        """
        # is in public recommended list
        pipeline_template = (
            db.session.query(PipelineBuiltInTemplate).where(PipelineBuiltInTemplate.id == template_id).first()
        )

        if not pipeline_template:
            return None

        dsl_data = yaml.safe_load(pipeline_template.yaml_content)
        # ``safe_load`` returns None for empty content and may return a non-mapping;
        # in those cases the graph is reported as empty instead of raising AttributeError.
        workflow_data = dsl_data.get("workflow", {}) if isinstance(dsl_data, dict) else {}
        graph_data = workflow_data.get("graph", {}) if isinstance(workflow_data, dict) else {}

        return {
            "id": pipeline_template.id,
            "name": pipeline_template.name,
            "icon_info": pipeline_template.icon,
            "description": pipeline_template.description,
            "chunk_structure": pipeline_template.chunk_structure,
            "export_data": pipeline_template.yaml_content,
            "graph": graph_data,
            "created_by": pipeline_template.created_user_name,
        }

View file

@ -0,0 +1,17 @@
from abc import ABC, abstractmethod
class PipelineTemplateRetrievalBase(ABC):
"""Interface for pipeline template retrieval."""
@abstractmethod
def get_pipeline_templates(self, language: str) -> dict:
raise NotImplementedError
@abstractmethod
def get_pipeline_template_detail(self, template_id: str) -> dict | None:
raise NotImplementedError
@abstractmethod
def get_type(self) -> str:
raise NotImplementedError

View file

@ -0,0 +1,26 @@
from services.rag_pipeline.pipeline_template.built_in.built_in_retrieval import BuiltInPipelineTemplateRetrieval
from services.rag_pipeline.pipeline_template.customized.customized_retrieval import CustomizedPipelineTemplateRetrieval
from services.rag_pipeline.pipeline_template.database.database_retrieval import DatabasePipelineTemplateRetrieval
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
from services.rag_pipeline.pipeline_template.remote.remote_retrieval import RemotePipelineTemplateRetrieval
class PipelineTemplateRetrievalFactory:
    """Resolves pipeline template retrieval classes."""

    @staticmethod
    def get_pipeline_template_factory(mode: str) -> type[PipelineTemplateRetrievalBase]:
        """
        Return the retrieval class (not an instance) for ``mode``.

        :param mode: one of the ``PipelineTemplateType`` values
        :raises ValueError: when ``mode`` matches none of them
        """
        if mode == PipelineTemplateType.REMOTE:
            return RemotePipelineTemplateRetrieval
        if mode == PipelineTemplateType.CUSTOMIZED:
            return CustomizedPipelineTemplateRetrieval
        if mode == PipelineTemplateType.DATABASE:
            return DatabasePipelineTemplateRetrieval
        if mode == PipelineTemplateType.BUILTIN:
            return BuiltInPipelineTemplateRetrieval
        raise ValueError(f"invalid fetch recommended apps mode: {mode}")

    @staticmethod
    def get_built_in_pipeline_template_retrieval():
        """Return the built-in retrieval class."""
        return BuiltInPipelineTemplateRetrieval

View file

@ -0,0 +1,8 @@
from enum import StrEnum
class PipelineTemplateType(StrEnum):
REMOTE = "remote"
DATABASE = "database"
CUSTOMIZED = "customized"
BUILTIN = "builtin"

View file

@ -0,0 +1,67 @@
import logging
import requests
from configs import dify_config
from services.rag_pipeline.pipeline_template.database.database_retrieval import DatabasePipelineTemplateRetrieval
from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
logger = logging.getLogger(__name__)
class RemotePipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
    """
    Retrieval pipeline template from dify official, falling back to the database when the request fails
    """

    def get_pipeline_template_detail(self, template_id: str):
        try:
            result = self.fetch_pipeline_template_detail_from_dify_official(template_id)
        except Exception as e:
            logger.warning("fetch pipeline template detail from dify official failed: %r, switch to database.", e)
            result = DatabasePipelineTemplateRetrieval.fetch_pipeline_template_detail_from_db(template_id)
        return result

    def get_pipeline_templates(self, language: str) -> dict:
        try:
            result = self.fetch_pipeline_templates_from_dify_official(language)
        except Exception as e:
            logger.warning("fetch pipeline templates from dify official failed: %r, switch to database.", e)
            result = DatabasePipelineTemplateRetrieval.fetch_pipeline_templates_from_db(language)
        return result

    def get_type(self) -> str:
        return PipelineTemplateType.REMOTE

    @classmethod
    def fetch_pipeline_template_detail_from_dify_official(cls, template_id: str) -> dict | None:
        """
        Fetch pipeline template detail from dify official.
        :param template_id: Template ID
        :return: the decoded JSON body, or None when the response status is not 200
        """
        domain = dify_config.HOSTED_FETCH_PIPELINE_TEMPLATES_REMOTE_DOMAIN
        url = f"{domain}/pipeline-templates/{template_id}"
        response = requests.get(url, timeout=(3, 10))
        # A non-200 answer yields None here; it does not trigger the database fallback.
        if response.status_code != 200:
            return None
        data: dict = response.json()
        return data

    @classmethod
    def fetch_pipeline_templates_from_dify_official(cls, language: str) -> dict:
        """
        Fetch pipeline templates from dify official.
        :param language: language
        :return: the decoded JSON body
        :raises ValueError: when the response status is not 200
        """
        domain = dify_config.HOSTED_FETCH_PIPELINE_TEMPLATES_REMOTE_DOMAIN
        url = f"{domain}/pipeline-templates"
        # ``params`` lets requests URL-encode the value instead of splicing it into the query string.
        response = requests.get(url, params={"language": language}, timeout=(3, 10))
        if response.status_code != 200:
            raise ValueError(f"fetch pipeline templates failed, status code: {response.status_code}")

        result: dict = response.json()
        return result

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,932 @@
import base64
import hashlib
import json
import logging
import uuid
from collections.abc import Mapping
from datetime import UTC, datetime
from enum import StrEnum
from typing import cast
from urllib.parse import urlparse
from uuid import uuid4
import yaml # type: ignore
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
from flask_login import current_user
from packaging import version
from pydantic import BaseModel, Field
from sqlalchemy import select
from sqlalchemy.orm import Session
from core.helper import ssrf_proxy
from core.helper.name_generator import generate_incremental_name
from core.model_runtime.utils.encoders import jsonable_encoder
from core.plugin.entities.plugin import PluginDependency
from core.workflow.enums import NodeType
from core.workflow.nodes.datasource.entities import DatasourceNodeData
from core.workflow.nodes.knowledge_retrieval.entities import KnowledgeRetrievalNodeData
from core.workflow.nodes.llm.entities import LLMNodeData
from core.workflow.nodes.parameter_extractor.entities import ParameterExtractorNodeData
from core.workflow.nodes.question_classifier.entities import QuestionClassifierNodeData
from core.workflow.nodes.tool.entities import ToolNodeData
from extensions.ext_redis import redis_client
from factories import variable_factory
from models import Account
from models.dataset import Dataset, DatasetCollectionBinding, Pipeline
from models.workflow import Workflow, WorkflowType
from services.entities.knowledge_entities.rag_pipeline_entities import (
IconInfo,
KnowledgeConfiguration,
RagPipelineDatasetCreateEntity,
)
from services.plugin.dependencies_analysis import DependenciesAnalysisService
logger = logging.getLogger(__name__)
# Key prefixes; presumably used for Redis entries holding pending import and
# dependency-check data - the usage is outside this view, confirm there.
IMPORT_INFO_REDIS_KEY_PREFIX = "app_import_info:"
CHECK_DEPENDENCIES_REDIS_KEY_PREFIX = "app_check_dependencies:"
IMPORT_INFO_REDIS_EXPIRY = 10 * 60 # 10 minutes
# Upper bound applied to the length of YAML content fetched from a URL.
DSL_MAX_SIZE = 10 * 1024 * 1024 # 10MB
# Version that imported DSL versions are compared against.
CURRENT_DSL_VERSION = "0.1.0"
class ImportMode(StrEnum):
YAML_CONTENT = "yaml-content"
YAML_URL = "yaml-url"
class ImportStatus(StrEnum):
COMPLETED = "completed"
COMPLETED_WITH_WARNINGS = "completed-with-warnings"
PENDING = "pending"
FAILED = "failed"
class RagPipelineImportInfo(BaseModel):
    """Result record of a rag pipeline import attempt."""

    id: str
    status: ImportStatus
    pipeline_id: str | None = None
    # Defaults to the DSL version this module declares as current.
    current_dsl_version: str = CURRENT_DSL_VERSION
    imported_dsl_version: str = ""
    # Error message; empty by default.
    error: str = ""
    dataset_id: str | None = None
class CheckDependenciesResult(BaseModel):
    """Result of a dependency check."""

    # Each instance gets its own empty list by default.
    leaked_dependencies: list[PluginDependency] = Field(default_factory=list)
def _check_version_compatibility(imported_version: str) -> ImportStatus:
    """
    Determine import status based on version comparison.

    :param imported_version: DSL version string found in the imported content
    :return: FAILED when a version cannot be parsed; PENDING when the imported
        version is newer than the current one or has a lower major number;
        COMPLETED_WITH_WARNINGS when its minor number is lower; otherwise COMPLETED
    """
    try:
        current = version.parse(CURRENT_DSL_VERSION)
        imported = version.parse(imported_version)
    except version.InvalidVersion:
        return ImportStatus.FAILED

    # Newer than current, or an older major version: PENDING in both cases.
    if imported > current or imported.major < current.major:
        return ImportStatus.PENDING

    # Same major but an older minor version.
    if imported.minor < current.minor:
        return ImportStatus.COMPLETED_WITH_WARNINGS

    # Only the micro part can differ (equal or older) at this point.
    return ImportStatus.COMPLETED
class RagPipelinePendingData(BaseModel):
    """Data kept for a rag pipeline import that has not completed yet."""

    import_mode: str
    yaml_content: str
    # Required field; None is an accepted value.
    pipeline_id: str | None
class CheckDependenciesPendingData(BaseModel):
    """Data kept for a pending dependency check."""

    dependencies: list[PluginDependency]
    # Required field; None is an accepted value.
    pipeline_id: str | None
class RagPipelineDslService:
def __init__(self, session: Session):
self._session = session
def import_rag_pipeline(
    self,
    *,
    account: Account,
    import_mode: str,
    yaml_content: str | None = None,
    yaml_url: str | None = None,
    pipeline_id: str | None = None,
    dataset: Dataset | None = None,
    dataset_name: str | None = None,
    icon_info: IconInfo | None = None,
) -> RagPipelineImportInfo:
    """Import a RAG pipeline, and the dataset bound to it, from a YAML DSL.

    The DSL text is taken from ``yaml_content`` or downloaded from ``yaml_url``
    depending on ``import_mode``. Apart from an invalid ``import_mode``, failures
    are reported through the returned ``RagPipelineImportInfo``
    (``status=FAILED`` plus ``error``) rather than raised.

    :param account: account performing the import; its current tenant scopes the
        pipeline lookup and owns any dataset that gets created
    :param import_mode: one of the ``ImportMode`` values
    :param yaml_content: DSL text, required for ``yaml-content`` mode
    :param yaml_url: DSL location, required for ``yaml-url`` mode
    :param pipeline_id: existing pipeline to update; when given, the dataset is
        re-resolved from that pipeline and replaces ``dataset``
    :param dataset: dataset to reuse instead of creating a new one
    :param dataset_name: NOTE(review): accepted but not consulted here - a new
        dataset is named after the pipeline; confirm whether that is intended
    :param icon_info: icon settings for a new dataset; falls back to the icon
        fields of the DSL's ``rag_pipeline`` section
    :return: import result; ``PENDING`` means the payload was stored in Redis and
        has to be confirmed through ``confirm_import``
    :raises ValueError: if ``import_mode`` is not a valid ``ImportMode``
    """
    import_id = str(uuid.uuid4())

    # Validate import mode
    try:
        mode = ImportMode(import_mode)
    except ValueError:
        raise ValueError(f"Invalid import_mode: {import_mode}")

    # Get YAML content
    content: str = ""
    if mode == ImportMode.YAML_URL:
        if not yaml_url:
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error="yaml_url is required when import_mode is yaml-url",
            )
        try:
            parsed_url = urlparse(yaml_url)
            # Rewrite a github.com "blob" page URL to the matching raw-content URL.
            if (
                parsed_url.scheme == "https"
                and parsed_url.netloc == "github.com"
                and parsed_url.path.endswith((".yml", ".yaml"))
            ):
                yaml_url = yaml_url.replace("https://github.com", "https://raw.githubusercontent.com")
                yaml_url = yaml_url.replace("/blob/", "/")
            response = ssrf_proxy.get(yaml_url.strip(), follow_redirects=True, timeout=(10, 10))
            response.raise_for_status()
            raw_content = response.content
            # The limit is a file size, so measure the downloaded bytes: counting decoded
            # characters would let multi-byte content through well above 10MB.
            if len(raw_content) > DSL_MAX_SIZE:
                return RagPipelineImportInfo(
                    id=import_id,
                    status=ImportStatus.FAILED,
                    error="File size exceeds the limit of 10MB",
                )
            content = raw_content.decode()

            if not content:
                return RagPipelineImportInfo(
                    id=import_id,
                    status=ImportStatus.FAILED,
                    error="Empty content from url",
                )
        except Exception as e:
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error=f"Error fetching YAML from URL: {str(e)}",
            )
    elif mode == ImportMode.YAML_CONTENT:
        if not yaml_content:
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error="yaml_content is required when import_mode is yaml-content",
            )
        content = yaml_content

    # Process YAML content
    try:
        # Parse YAML to validate format
        data = yaml.safe_load(content)
        if not isinstance(data, dict):
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error="Invalid YAML format: content must be a mapping",
            )

        # Validate and fix DSL version / kind
        if not data.get("version"):
            data["version"] = "0.1.0"
        if data.get("kind") != "rag_pipeline":
            data["kind"] = "rag_pipeline"

        imported_version = data.get("version", "0.1.0")
        # YAML parses an unquoted value such as 0.1 as a float; only strings are accepted.
        if not isinstance(imported_version, str):
            raise ValueError(f"Invalid version type, expected str, got {type(imported_version)}")
        status = _check_version_compatibility(imported_version)
        # An unparsable version must stop the import before anything is written;
        # otherwise a pipeline would be created and still be reported as FAILED.
        if status == ImportStatus.FAILED:
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                imported_dsl_version=imported_version,
                error=f"Invalid DSL version: {imported_version}",
            )

        # Extract pipeline data
        pipeline_data = data.get("rag_pipeline")
        if not pipeline_data:
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error="Missing rag_pipeline data in YAML content",
            )

        # If pipeline_id is provided, it has to exist within the account's tenant
        pipeline = None
        if pipeline_id:
            stmt = select(Pipeline).where(
                Pipeline.id == pipeline_id,
                Pipeline.tenant_id == account.current_tenant_id,
            )
            pipeline = self._session.scalar(stmt)

            if not pipeline:
                return RagPipelineImportInfo(
                    id=import_id,
                    status=ImportStatus.FAILED,
                    error="Pipeline not found",
                )
            dataset = pipeline.retrieve_dataset(session=self._session)

        # A version mismatch needs confirmation: keep the payload in Redis and stop here
        if status == ImportStatus.PENDING:
            pending_data = RagPipelinePendingData(
                import_mode=import_mode,
                yaml_content=content,
                pipeline_id=pipeline_id,
            )
            redis_client.setex(
                f"{IMPORT_INFO_REDIS_KEY_PREFIX}{import_id}",
                IMPORT_INFO_REDIS_EXPIRY,
                pending_data.model_dump_json(),
            )

            return RagPipelineImportInfo(
                id=import_id,
                status=status,
                pipeline_id=pipeline_id,
                imported_dsl_version=imported_version,
            )

        # Extract dependencies
        dependencies = data.get("dependencies", [])
        check_dependencies_pending_data = None
        if dependencies:
            check_dependencies_pending_data = [PluginDependency.model_validate(d) for d in dependencies]

        # Create or update pipeline
        pipeline = self._create_or_update_pipeline(
            pipeline=pipeline,
            data=data,
            account=account,
            dependencies=check_dependencies_pending_data,
        )

        # Dataset attributes
        name = pipeline.name or "Untitled"
        description = pipeline.description
        if icon_info:
            icon_type = icon_info.icon_type
            icon = icon_info.icon
            icon_background = icon_info.icon_background
            icon_url = icon_info.icon_url
        else:
            icon_type = pipeline_data.get("icon_type")
            icon = pipeline_data.get("icon")
            icon_background = pipeline_data.get("icon_background")
            icon_url = pipeline_data.get("icon_url")

        workflow = data.get("workflow", {})
        graph = workflow.get("graph", {})
        nodes = graph.get("nodes", [])
        dataset_id = None
        for node in nodes:
            if node.get("data", {}).get("type") == "knowledge-index":
                knowledge_configuration = KnowledgeConfiguration(**node.get("data", {}))
                if (
                    dataset
                    and pipeline.is_published
                    and dataset.chunk_structure != knowledge_configuration.chunk_structure
                ):
                    raise ValueError("Chunk structure is not compatible with the published pipeline")
                if not dataset:
                    datasets = self._session.query(Dataset).filter_by(tenant_id=account.current_tenant_id).all()
                    names = [existing.name for existing in datasets]
                    generate_name = generate_incremental_name(names, name)
                    dataset = Dataset(
                        tenant_id=account.current_tenant_id,
                        name=generate_name,
                        description=description,
                        icon_info={
                            "icon_type": icon_type,
                            "icon": icon,
                            "icon_background": icon_background,
                            "icon_url": icon_url,
                        },
                        indexing_technique=knowledge_configuration.indexing_technique,
                        created_by=account.id,
                        retrieval_model=knowledge_configuration.retrieval_model.model_dump(),
                        runtime_mode="rag_pipeline",
                        chunk_structure=knowledge_configuration.chunk_structure,
                    )
                if knowledge_configuration.indexing_technique == "high_quality":
                    # Reuse the oldest collection binding for this embedding model, or create one.
                    dataset_collection_binding = (
                        self._session.query(DatasetCollectionBinding)
                        .where(
                            DatasetCollectionBinding.provider_name
                            == knowledge_configuration.embedding_model_provider,
                            DatasetCollectionBinding.model_name == knowledge_configuration.embedding_model,
                            DatasetCollectionBinding.type == "dataset",
                        )
                        .order_by(DatasetCollectionBinding.created_at)
                        .first()
                    )

                    if not dataset_collection_binding:
                        dataset_collection_binding = DatasetCollectionBinding(
                            provider_name=knowledge_configuration.embedding_model_provider,
                            model_name=knowledge_configuration.embedding_model,
                            collection_name=Dataset.gen_collection_name_by_id(str(uuid.uuid4())),
                            type="dataset",
                        )
                        self._session.add(dataset_collection_binding)
                        self._session.commit()
                    dataset_collection_binding_id = dataset_collection_binding.id
                    dataset.collection_binding_id = dataset_collection_binding_id
                    dataset.embedding_model = knowledge_configuration.embedding_model
                    dataset.embedding_model_provider = knowledge_configuration.embedding_model_provider
                elif knowledge_configuration.indexing_technique == "economy":
                    dataset.keyword_number = knowledge_configuration.keyword_number
                dataset.pipeline_id = pipeline.id
                self._session.add(dataset)
                self._session.commit()
                dataset_id = dataset.id
        # Without a knowledge-index node no dataset was bound, so the DSL is unusable.
        if not dataset_id:
            raise ValueError("DSL is not valid, please check the Knowledge Index node.")

        return RagPipelineImportInfo(
            id=import_id,
            status=status,
            pipeline_id=pipeline.id,
            dataset_id=dataset_id,
            imported_dsl_version=imported_version,
        )

    except yaml.YAMLError as e:
        return RagPipelineImportInfo(
            id=import_id,
            status=ImportStatus.FAILED,
            error=f"Invalid YAML format: {str(e)}",
        )

    except Exception as e:
        logger.exception("Failed to import app")
        return RagPipelineImportInfo(
            id=import_id,
            status=ImportStatus.FAILED,
            error=str(e),
        )
def confirm_import(self, *, import_id: str, account: Account) -> RagPipelineImportInfo:
    """
    Confirm an import that requires confirmation

    Loads the payload stored in Redis under ``import_id``, creates or updates the
    pipeline from it, and creates or updates the dataset described by the DSL's
    ``knowledge-index`` node.

    :param import_id: id of the pending import whose payload is stored in Redis
    :param account: account confirming the import; its current tenant scopes the
        pipeline lookup and owns a newly created dataset
    :return: import result; failures, including a missing or expired payload, are
        reported as ``status=FAILED`` with ``error`` set instead of being raised
    """
    redis_key = f"{IMPORT_INFO_REDIS_KEY_PREFIX}{import_id}"
    pending_data = redis_client.get(redis_key)

    if not pending_data:
        return RagPipelineImportInfo(
            id=import_id,
            status=ImportStatus.FAILED,
            error="Import information expired or does not exist",
        )

    try:
        if not isinstance(pending_data, str | bytes):
            return RagPipelineImportInfo(
                id=import_id,
                status=ImportStatus.FAILED,
                error="Invalid import information",
            )
        pending_data = RagPipelinePendingData.model_validate_json(pending_data)
        data = yaml.safe_load(pending_data.yaml_content)

        # Look up the target pipeline inside the account's tenant. If it is not found,
        # pipeline stays None and _create_or_update_pipeline creates a new pipeline.
        pipeline = None
        if pending_data.pipeline_id:
            stmt = select(Pipeline).where(
                Pipeline.id == pending_data.pipeline_id,
                Pipeline.tenant_id == account.current_tenant_id,
            )
            pipeline = self._session.scalar(stmt)

        # Create or update the pipeline
        pipeline = self._create_or_update_pipeline(
            pipeline=pipeline,
            data=data,
            account=account,
        )
        dataset = pipeline.retrieve_dataset(session=self._session)

        # Attributes for a dataset that may have to be created
        name = pipeline.name
        description = pipeline.description
        icon_type = data.get("rag_pipeline", {}).get("icon_type")
        icon = data.get("rag_pipeline", {}).get("icon")
        icon_background = data.get("rag_pipeline", {}).get("icon_background")
        icon_url = data.get("rag_pipeline", {}).get("icon_url")
        workflow = data.get("workflow", {})
        graph = workflow.get("graph", {})
        nodes = graph.get("nodes", [])
        dataset_id = None
        for node in nodes:
            if node.get("data", {}).get("type") == "knowledge-index":
                knowledge_configuration = KnowledgeConfiguration(**node.get("data", {}))
                if not dataset:
                    dataset = Dataset(
                        tenant_id=account.current_tenant_id,
                        name=name,
                        description=description,
                        icon_info={
                            "icon_type": icon_type,
                            "icon": icon,
                            "icon_background": icon_background,
                            "icon_url": icon_url,
                        },
                        indexing_technique=knowledge_configuration.indexing_technique,
                        created_by=account.id,
                        retrieval_model=knowledge_configuration.retrieval_model.model_dump(),
                        runtime_mode="rag_pipeline",
                        chunk_structure=knowledge_configuration.chunk_structure,
                    )
                else:
                    # An existing dataset is overwritten with the settings from the DSL.
                    dataset.indexing_technique = knowledge_configuration.indexing_technique
                    dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump()
                    dataset.runtime_mode = "rag_pipeline"
                    dataset.chunk_structure = knowledge_configuration.chunk_structure
                if knowledge_configuration.indexing_technique == "high_quality":
                    # Reuse the oldest collection binding for this embedding model, or create one.
                    dataset_collection_binding = (
                        self._session.query(DatasetCollectionBinding)
                        .where(
                            DatasetCollectionBinding.provider_name
                            == knowledge_configuration.embedding_model_provider,
                            DatasetCollectionBinding.model_name == knowledge_configuration.embedding_model,
                            DatasetCollectionBinding.type == "dataset",
                        )
                        .order_by(DatasetCollectionBinding.created_at)
                        .first()
                    )

                    if not dataset_collection_binding:
                        dataset_collection_binding = DatasetCollectionBinding(
                            provider_name=knowledge_configuration.embedding_model_provider,
                            model_name=knowledge_configuration.embedding_model,
                            collection_name=Dataset.gen_collection_name_by_id(str(uuid.uuid4())),
                            type="dataset",
                        )
                        self._session.add(dataset_collection_binding)
                        self._session.commit()
                    dataset_collection_binding_id = dataset_collection_binding.id
                    dataset.collection_binding_id = dataset_collection_binding_id
                    dataset.embedding_model = knowledge_configuration.embedding_model
                    dataset.embedding_model_provider = knowledge_configuration.embedding_model_provider
                elif knowledge_configuration.indexing_technique == "economy":
                    dataset.keyword_number = knowledge_configuration.keyword_number
                dataset.pipeline_id = pipeline.id
                self._session.add(dataset)
                self._session.commit()
                dataset_id = dataset.id
        # No knowledge-index node was processed, so no dataset could be bound.
        if not dataset_id:
            raise ValueError("DSL is not valid, please check the Knowledge Index node.")

        # Delete import info from Redis (reached only after a successful import)
        redis_client.delete(redis_key)

        return RagPipelineImportInfo(
            id=import_id,
            status=ImportStatus.COMPLETED,
            pipeline_id=pipeline.id,
            dataset_id=dataset_id,
            current_dsl_version=CURRENT_DSL_VERSION,
            imported_dsl_version=data.get("version", "0.1.0"),
        )

    except Exception as e:
        logger.exception("Error confirming import")
        return RagPipelineImportInfo(
            id=import_id,
            status=ImportStatus.FAILED,
            error=str(e),
        )
def check_dependencies(
    self,
    *,
    pipeline: Pipeline,
) -> CheckDependenciesResult:
    """Report the leaked dependencies among those recorded for a pipeline.

    :param pipeline: pipeline whose recorded dependencies are looked up in Redis
    :return: an empty result when nothing is stored for the pipeline, otherwise
        whatever ``DependenciesAnalysisService.get_leaked_dependencies`` reports
        for the stored list within the pipeline's tenant
    """
    cached = redis_client.get(f"{CHECK_DEPENDENCIES_REDIS_KEY_PREFIX}{pipeline.id}")
    if not cached:
        return CheckDependenciesResult()

    # Decode the stored payload, then hand its dependency list to the analysis service.
    pending = CheckDependenciesPendingData.model_validate_json(cached)
    leaked = DependenciesAnalysisService.get_leaked_dependencies(
        tenant_id=pipeline.tenant_id,
        dependencies=pending.dependencies,
    )
    return CheckDependenciesResult(leaked_dependencies=leaked)
def _create_or_update_pipeline(
    self,
    *,
    pipeline: Pipeline | None,
    data: dict,
    account: Account,
    dependencies: list[PluginDependency] | None = None,
) -> Pipeline:
    """Create a pipeline from parsed DSL data, or update an existing one.

    Besides the pipeline row this also creates or updates the pipeline's draft
    workflow and, when ``dependencies`` is non-empty, stores them in Redis for a
    later dependency check.

    :param pipeline: pipeline to update, or ``None`` to create a new one
    :param data: parsed DSL mapping; ``workflow`` is required, ``rag_pipeline``
        supplies name and description
    :param account: acting account; supplies the tenant and the audit user id
    :param dependencies: plugin dependencies declared by the DSL, if any
    :return: the created or updated pipeline
    :raises ValueError: if the account has no current tenant or the DSL carries
        no workflow mapping
    """
    # Read the tenant once; everything below relies on it being set.
    tenant_id = account.current_tenant_id
    if not tenant_id:
        raise ValueError("Tenant id is required")
    pipeline_data = data.get("rag_pipeline", {})

    workflow_data = data.get("workflow")
    if not workflow_data or not isinstance(workflow_data, dict):
        raise ValueError("Missing workflow data for rag pipeline")

    environment_variables_list = workflow_data.get("environment_variables", [])
    environment_variables = [
        variable_factory.build_environment_variable_from_mapping(obj) for obj in environment_variables_list
    ]
    conversation_variables_list = workflow_data.get("conversation_variables", [])
    conversation_variables = [
        variable_factory.build_conversation_variable_from_mapping(obj) for obj in conversation_variables_list
    ]
    rag_pipeline_variables_list = workflow_data.get("rag_pipeline_variables", [])

    graph = workflow_data.get("graph", {})
    # Knowledge-retrieval nodes carry encrypted dataset ids in the DSL. Decrypt them in
    # place and drop every id that does not decrypt for this tenant.
    for node in graph.get("nodes", []):
        if node.get("data", {}).get("type", "") == NodeType.KNOWLEDGE_RETRIEVAL.value:
            dataset_ids = node["data"].get("dataset_ids", [])
            node["data"]["dataset_ids"] = [
                decrypted_id
                for dataset_id in dataset_ids
                if (
                    decrypted_id := self.decrypt_dataset_id(
                        encrypted_data=dataset_id,
                        tenant_id=tenant_id,
                    )
                )
            ]

    if pipeline:
        # Update existing pipeline; values missing from the DSL keep their current state
        pipeline.name = pipeline_data.get("name", pipeline.name)
        pipeline.description = pipeline_data.get("description", pipeline.description)
        pipeline.updated_by = account.id
    else:
        # Create new pipeline
        pipeline = Pipeline()
        pipeline.id = str(uuid4())
        pipeline.tenant_id = tenant_id
        pipeline.name = pipeline_data.get("name", "")
        pipeline.description = pipeline_data.get("description", "")
        pipeline.created_by = account.id
        pipeline.updated_by = account.id

        self._session.add(pipeline)
        self._session.commit()

    # save dependencies
    if dependencies:
        redis_client.setex(
            f"{CHECK_DEPENDENCIES_REDIS_KEY_PREFIX}{pipeline.id}",
            IMPORT_INFO_REDIS_EXPIRY,
            CheckDependenciesPendingData(pipeline_id=pipeline.id, dependencies=dependencies).model_dump_json(),
        )

    workflow = (
        self._session.query(Workflow)
        .where(
            Workflow.tenant_id == pipeline.tenant_id,
            Workflow.app_id == pipeline.id,
            Workflow.version == "draft",
        )
        .first()
    )

    # create draft workflow if not found
    if not workflow:
        workflow = Workflow(
            tenant_id=pipeline.tenant_id,
            app_id=pipeline.id,
            features="{}",
            type=WorkflowType.RAG_PIPELINE.value,
            version="draft",
            graph=json.dumps(graph),
            created_by=account.id,
            environment_variables=environment_variables,
            conversation_variables=conversation_variables,
            rag_pipeline_variables=rag_pipeline_variables_list,
        )
        self._session.add(workflow)
        # Flush so that workflow.id is available for the pipeline before the commit.
        self._session.flush()
        pipeline.workflow_id = workflow.id
    else:
        workflow.graph = json.dumps(graph)
        workflow.updated_by = account.id
        workflow.updated_at = datetime.now(UTC).replace(tzinfo=None)
        workflow.environment_variables = environment_variables
        workflow.conversation_variables = conversation_variables
        workflow.rag_pipeline_variables = rag_pipeline_variables_list

    # commit db session changes
    self._session.commit()

    return pipeline
def export_rag_pipeline_dsl(self, pipeline: Pipeline, include_secret: bool = False) -> str:
    """
    Serialize a pipeline and its draft workflow as a YAML DSL document
    :param pipeline: Pipeline instance
    :param include_secret: forwarded to the workflow export to control secret variables
    :return: YAML text; the exported name and icon come from the pipeline's dataset
    :raises ValueError: if the pipeline has no dataset
    """
    dataset = pipeline.retrieve_dataset(session=self._session)
    if not dataset:
        raise ValueError("Missing dataset for rag pipeline")

    # A missing or empty icon_info falls back to the defaults below.
    icon_settings = dataset.icon_info or {}
    pipeline_section = {
        "name": dataset.name,
        "icon": icon_settings.get("icon", "📙"),
        "icon_type": icon_settings.get("icon_type", "emoji"),
        "icon_background": icon_settings.get("icon_background", "#FFEAD5"),
        "icon_url": icon_settings.get("icon_url"),
        "description": pipeline.description,
    }
    export_data = {
        "version": CURRENT_DSL_VERSION,
        "kind": "rag_pipeline",
        "rag_pipeline": pipeline_section,
    }

    self._append_workflow_export_data(export_data=export_data, pipeline=pipeline, include_secret=include_secret)

    return yaml.dump(export_data, allow_unicode=True)  # type: ignore
def _append_workflow_export_data(self, *, export_data: dict, pipeline: Pipeline, include_secret: bool) -> None:
    """
    Add the ``workflow`` and ``dependencies`` sections to an export mapping
    :param export_data: export mapping, modified in place
    :param pipeline: Pipeline instance whose draft workflow is exported
    :param include_secret: forwarded to ``Workflow.to_dict``
    :raises ValueError: if the pipeline has no draft workflow
    """
    draft_query = self._session.query(Workflow).where(
        Workflow.tenant_id == pipeline.tenant_id,
        Workflow.app_id == pipeline.id,
        Workflow.version == "draft",
    )
    workflow = draft_query.first()
    if not workflow:
        raise ValueError("Missing draft workflow configuration, please check.")

    workflow_dict = workflow.to_dict(include_secret=include_secret)
    # Dataset ids referenced by knowledge-retrieval nodes are exported encrypted.
    for node in workflow_dict.get("graph", {}).get("nodes", []):
        node_data = node.get("data", {})
        if node_data.get("type", "") != NodeType.KNOWLEDGE_RETRIEVAL.value:
            continue
        node_data["dataset_ids"] = [
            self.encrypt_dataset_id(dataset_id=dataset_id, tenant_id=pipeline.tenant_id)
            for dataset_id in node_data.get("dataset_ids", [])
        ]
    export_data["workflow"] = workflow_dict

    # Dependencies are derived from the stored workflow graph.
    plugin_ids = self._extract_dependencies_from_workflow(workflow)
    generated = DependenciesAnalysisService.generate_dependencies(
        tenant_id=pipeline.tenant_id, dependencies=plugin_ids
    )
    export_data["dependencies"] = [jsonable_encoder(item.model_dump()) for item in generated]
def _extract_dependencies_from_workflow(self, workflow: Workflow) -> list[str]:
"""
Extract dependencies from workflow
:param workflow: Workflow instance
:return: dependencies list format like ["langgenius/google"]
"""
graph = workflow.graph_dict
dependencies = self._extract_dependencies_from_workflow_graph(graph)
return dependencies
def _extract_dependencies_from_workflow_graph(self, graph: Mapping) -> list[str]:
"""
Extract dependencies from workflow graph
:param graph: Workflow graph
:return: dependencies list format like ["langgenius/google"]
"""
dependencies = []
for node in graph.get("nodes", []):
try:
typ = node.get("data", {}).get("type")
match typ:
case NodeType.TOOL.value:
tool_entity = ToolNodeData(**node["data"])
dependencies.append(
DependenciesAnalysisService.analyze_tool_dependency(tool_entity.provider_id),
)
case NodeType.DATASOURCE.value:
datasource_entity = DatasourceNodeData(**node["data"])
if datasource_entity.provider_type != "local_file":
dependencies.append(datasource_entity.plugin_id)
case NodeType.LLM.value:
llm_entity = LLMNodeData(**node["data"])
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(llm_entity.model.provider),
)
case NodeType.QUESTION_CLASSIFIER.value:
question_classifier_entity = QuestionClassifierNodeData(**node["data"])
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
question_classifier_entity.model.provider
),
)
case NodeType.PARAMETER_EXTRACTOR.value:
parameter_extractor_entity = ParameterExtractorNodeData(**node["data"])
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
parameter_extractor_entity.model.provider
),
)
case NodeType.KNOWLEDGE_INDEX.value:
knowledge_index_entity = KnowledgeConfiguration(**node["data"])
if knowledge_index_entity.indexing_technique == "high_quality":
if knowledge_index_entity.embedding_model_provider:
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
knowledge_index_entity.embedding_model_provider
),
)
if knowledge_index_entity.retrieval_model.reranking_mode == "reranking_model":
if knowledge_index_entity.retrieval_model.reranking_enable:
if (
knowledge_index_entity.retrieval_model.reranking_model
and knowledge_index_entity.retrieval_model.reranking_mode == "reranking_model"
):
if knowledge_index_entity.retrieval_model.reranking_model.reranking_provider_name:
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
knowledge_index_entity.retrieval_model.reranking_model.reranking_provider_name
),
)
case NodeType.KNOWLEDGE_RETRIEVAL.value:
knowledge_retrieval_entity = KnowledgeRetrievalNodeData(**node["data"])
if knowledge_retrieval_entity.retrieval_mode == "multiple":
if knowledge_retrieval_entity.multiple_retrieval_config:
if (
knowledge_retrieval_entity.multiple_retrieval_config.reranking_mode
== "reranking_model"
):
if knowledge_retrieval_entity.multiple_retrieval_config.reranking_model:
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
knowledge_retrieval_entity.multiple_retrieval_config.reranking_model.provider
),
)
elif (
knowledge_retrieval_entity.multiple_retrieval_config.reranking_mode
== "weighted_score"
):
if knowledge_retrieval_entity.multiple_retrieval_config.weights:
vector_setting = (
knowledge_retrieval_entity.multiple_retrieval_config.weights.vector_setting
)
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
vector_setting.embedding_provider_name
),
)
elif knowledge_retrieval_entity.retrieval_mode == "single":
model_config = knowledge_retrieval_entity.single_retrieval_config
if model_config:
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
model_config.model.provider
),
)
case _:
# TODO: Handle default case or unknown node types
pass
except Exception as e:
logger.exception("Error extracting node dependency", exc_info=e)
return dependencies
@classmethod
def _extract_dependencies_from_model_config(cls, model_config: Mapping) -> list[str]:
"""
Extract dependencies from model config
:param model_config: model config dict
:return: dependencies list format like ["langgenius/google"]
"""
dependencies = []
try:
# completion model
model_dict = model_config.get("model", {})
if model_dict:
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(model_dict.get("provider", ""))
)
# reranking model
dataset_configs = model_config.get("dataset_configs", {})
if dataset_configs:
for dataset_config in dataset_configs.get("datasets", {}).get("datasets", []):
if dataset_config.get("reranking_model"):
dependencies.append(
DependenciesAnalysisService.analyze_model_provider_dependency(
dataset_config.get("reranking_model", {})
.get("reranking_provider_name", {})
.get("provider")
)
)
# tools
agent_configs = model_config.get("agent_mode", {})
if agent_configs:
for agent_config in agent_configs.get("tools", []):
dependencies.append(
DependenciesAnalysisService.analyze_tool_dependency(agent_config.get("provider_id"))
)
except Exception as e:
logger.exception("Error extracting model config dependency", exc_info=e)
return dependencies
@classmethod
def get_leaked_dependencies(cls, tenant_id: str, dsl_dependencies: list[dict]) -> list[PluginDependency]:
"""
Returns the leaked dependencies in current workspace
"""
dependencies = [PluginDependency(**dep) for dep in dsl_dependencies]
if not dependencies:
return []
return DependenciesAnalysisService.get_leaked_dependencies(tenant_id=tenant_id, dependencies=dependencies)
def _generate_aes_key(self, tenant_id: str) -> bytes:
"""Generate AES key based on tenant_id"""
return hashlib.sha256(tenant_id.encode()).digest()
def encrypt_dataset_id(self, dataset_id: str, tenant_id: str) -> str:
    """Encrypt a dataset id with AES-CBC and return it base64-encoded.

    Key and IV are both derived from ``tenant_id`` (the IV is the first 16 bytes of
    the key), so no random component is involved in the output.
    """
    key = self._generate_aes_key(tenant_id)
    padded_id = pad(dataset_id.encode(), AES.block_size)
    ciphertext = AES.new(key, AES.MODE_CBC, key[:16]).encrypt(padded_id)
    return base64.b64encode(ciphertext).decode()
def decrypt_dataset_id(self, encrypted_data: str, tenant_id: str) -> str | None:
    """Decrypt a base64-encoded AES-CBC dataset id using the tenant-derived key and IV.

    :return: the decrypted id, or ``None`` when any step of the decryption fails
    """
    try:
        key = self._generate_aes_key(tenant_id)
        decryptor = AES.new(key, AES.MODE_CBC, key[:16])
        ciphertext = base64.b64decode(encrypted_data)
        plaintext = unpad(decryptor.decrypt(ciphertext), AES.block_size)
        return plaintext.decode()
    except Exception:
        # Best effort by design: any failure is reported as None instead of raised.
        return None
def create_rag_pipeline_dataset(
    self,
    tenant_id: str,
    rag_pipeline_dataset_create_entity: RagPipelineDatasetCreateEntity,
):
    """Create a pipeline-backed dataset by importing the YAML carried by the entity.

    :param tenant_id: tenant used for the dataset-name checks
    :param rag_pipeline_dataset_create_entity: creation request; its ``name`` is
        filled in with a generated "Untitled" name when empty
    :return: dict holding the fields of the resulting ``RagPipelineImportInfo``
    :raises ValueError: if a dataset with the requested name already exists in the tenant
    """
    entity = rag_pipeline_dataset_create_entity
    tenant_datasets = self._session.query(Dataset).filter_by(tenant_id=tenant_id)
    if not entity.name:
        # generate a random name as Untitled 1 2 3 ...
        taken_names = [existing.name for existing in tenant_datasets.all()]
        entity.name = generate_incremental_name(
            taken_names,
            "Untitled",
        )
    elif tenant_datasets.filter_by(name=entity.name).first():
        # the requested dataset name already exists
        raise ValueError(f"Dataset with name {rag_pipeline_dataset_create_entity.name} already exists.")

    account = cast(Account, current_user)
    import_info: RagPipelineImportInfo = self.import_rag_pipeline(
        account=account,
        import_mode=ImportMode.YAML_CONTENT.value,
        yaml_content=entity.yaml_content,
        dataset=None,
        dataset_name=entity.name,
        icon_info=entity.icon_info,
    )
    exported_fields = (
        "id",
        "dataset_id",
        "pipeline_id",
        "status",
        "imported_dsl_version",
        "current_dsl_version",
        "error",
    )
    return {field: getattr(import_info, field) for field in exported_fields}

View file

@ -0,0 +1,23 @@
from core.plugin.entities.plugin_daemon import PluginDatasourceProviderEntity
from core.plugin.impl.datasource import PluginDatasourceManager
from services.datasource_provider_service import DatasourceProviderService
class RagPipelineManageService:
    """Read-only helpers for managing RAG pipeline resources."""

    @staticmethod
    def list_rag_pipeline_datasources(tenant_id: str) -> list[PluginDatasourceProviderEntity]:
        """
        list rag pipeline datasources

        :param tenant_id: tenant whose datasource providers are listed
        :return: the providers fetched from ``PluginDatasourceManager``;
            ``is_authorized`` is set to True on each provider for which the
            tenant has credentials
        """
        # get all builtin providers
        manager = PluginDatasourceManager()
        datasources = manager.fetch_datasource_providers(tenant_id)
        # One service instance serves every provider; it does not depend on the loop item.
        datasource_provider_service = DatasourceProviderService()
        for datasource in datasources:
            credentials = datasource_provider_service.get_datasource_credentials(
                tenant_id=tenant_id, provider=datasource.provider, plugin_id=datasource.plugin_id
            )
            if credentials:
                datasource.is_authorized = True
        return datasources

View file

@ -0,0 +1,383 @@
import json
from datetime import UTC, datetime
from pathlib import Path
from uuid import uuid4
import yaml
from flask_login import current_user
from constants import DOCUMENT_EXTENSIONS
from core.plugin.impl.plugin import PluginInstaller
from extensions.ext_database import db
from factories import variable_factory
from models.dataset import Dataset, Document, DocumentPipelineExecutionLog, Pipeline
from models.model import UploadFile
from models.workflow import Workflow, WorkflowType
from services.entities.knowledge_entities.rag_pipeline_entities import KnowledgeConfiguration, RetrievalSetting
from services.plugin.plugin_migration import PluginMigration
from services.plugin.plugin_service import PluginService
class RagPipelineTransformService:
def transform_dataset(self, dataset_id: str):
    """Convert an existing dataset into a pipeline-backed ("rag_pipeline") dataset.

    :param dataset_id: id of the dataset to convert
    :return: dict with ``pipeline_id``, ``dataset_id`` and ``status``, or the
        result of the empty-pipeline conversion when the dataset has no
        datasource type and indexing technique, or no doc form
    :raises ValueError: if the dataset does not exist, is not a "vendor" dataset,
        or no usable pipeline template matches it
    """
    dataset = db.session.query(Dataset).where(Dataset.id == dataset_id).first()
    if not dataset:
        raise ValueError("Dataset not found")

    # Already converted: report the existing pipeline.
    already_converted = dataset.pipeline_id and dataset.runtime_mode == "rag_pipeline"
    if already_converted:
        return {
            "pipeline_id": dataset.pipeline_id,
            "dataset_id": dataset_id,
            "status": "success",
        }
    if dataset.provider != "vendor":
        raise ValueError("External dataset is not supported")

    datasource_type = dataset.data_source_type
    indexing_technique = dataset.indexing_technique
    if not (datasource_type or indexing_technique):
        return self._transfrom_to_empty_pipeline(dataset)

    doc_form = dataset.doc_form
    if not doc_form:
        return self._transfrom_to_empty_pipeline(dataset)

    retrieval_model = dataset.retrieval_model
    pipeline_yaml = self._get_transform_yaml(doc_form, datasource_type, indexing_technique)
    # deal dependencies
    self._deal_dependencies(pipeline_yaml, dataset.tenant_id)

    # Extract workflow data
    workflow_data = pipeline_yaml.get("workflow")
    if not workflow_data:
        raise ValueError("Missing workflow data for rag pipeline")
    graph = workflow_data.get("graph", {})

    # Adapt the template nodes to this dataset.
    adapted_nodes = []
    for node in graph.get("nodes", []):
        node_data = node.get("data", {})
        if node_data.get("type") == "datasource" and node_data.get("provider_type") == "local_file":
            node = self._deal_file_extensions(node)
        if node.get("data", {}).get("type") == "knowledge-index":
            node = self._deal_knowledge_index(dataset, doc_form, indexing_technique, retrieval_model, node)
        adapted_nodes.append(node)
    if adapted_nodes:
        graph["nodes"] = adapted_nodes
        workflow_data["graph"] = graph
        pipeline_yaml["workflow"] = workflow_data

    # create pipeline
    pipeline = self._create_pipeline(pipeline_yaml)

    # save chunk structure to dataset
    if doc_form not in ("hierarchical_model", "text_model"):
        raise ValueError("Unsupported doc form")
    dataset.chunk_structure = doc_form
    dataset.runtime_mode = "rag_pipeline"
    dataset.pipeline_id = pipeline.id

    # deal document data
    self._deal_document_data(dataset)

    db.session.commit()
    return {
        "pipeline_id": pipeline.id,
        "dataset_id": dataset_id,
        "status": "success",
    }
def _get_transform_yaml(self, doc_form: str, datasource_type: str, indexing_technique: str | None):
pipeline_yaml = {}
if doc_form == "text_model":
match datasource_type:
case "upload_file":
if indexing_technique == "high_quality":
# get graph from transform.file-general-high-quality.yml
with open(f"{Path(__file__).parent}/transform/file-general-high-quality.yml") as f:
pipeline_yaml = yaml.safe_load(f)
if indexing_technique == "economy":
# get graph from transform.file-general-economy.yml
with open(f"{Path(__file__).parent}/transform/file-general-economy.yml") as f:
pipeline_yaml = yaml.safe_load(f)
case "notion_import":
if indexing_technique == "high_quality":
# get graph from transform.notion-general-high-quality.yml
with open(f"{Path(__file__).parent}/transform/notion-general-high-quality.yml") as f:
pipeline_yaml = yaml.safe_load(f)
if indexing_technique == "economy":
# get graph from transform.notion-general-economy.yml
with open(f"{Path(__file__).parent}/transform/notion-general-economy.yml") as f:
pipeline_yaml = yaml.safe_load(f)
case "website_crawl":
if indexing_technique == "high_quality":
# get graph from transform.website-crawl-general-high-quality.yml
with open(f"{Path(__file__).parent}/transform/website-crawl-general-high-quality.yml") as f:
pipeline_yaml = yaml.safe_load(f)
if indexing_technique == "economy":
# get graph from transform.website-crawl-general-economy.yml
with open(f"{Path(__file__).parent}/transform/website-crawl-general-economy.yml") as f:
pipeline_yaml = yaml.safe_load(f)
case _:
raise ValueError("Unsupported datasource type")
elif doc_form == "hierarchical_model":
match datasource_type:
case "upload_file":
# get graph from transform.file-parentchild.yml
with open(f"{Path(__file__).parent}/transform/file-parentchild.yml") as f:
pipeline_yaml = yaml.safe_load(f)
case "notion_import":
# get graph from transform.notion-parentchild.yml
with open(f"{Path(__file__).parent}/transform/notion-parentchild.yml") as f:
pipeline_yaml = yaml.safe_load(f)
case "website_crawl":
# get graph from transform.website-crawl-parentchild.yml
with open(f"{Path(__file__).parent}/transform/website-crawl-parentchild.yml") as f:
pipeline_yaml = yaml.safe_load(f)
case _:
raise ValueError("Unsupported datasource type")
else:
raise ValueError("Unsupported doc form")
return pipeline_yaml
def _deal_file_extensions(self, node: dict):
file_extensions = node.get("data", {}).get("fileExtensions", [])
if not file_extensions:
return node
file_extensions = [file_extension.lower() for file_extension in file_extensions]
node["data"]["fileExtensions"] = DOCUMENT_EXTENSIONS
return node
def _deal_knowledge_index(
    self, dataset: Dataset, doc_form: str, indexing_technique: str | None, retrieval_model: dict, node: dict
):
    """Reconcile a knowledge-index node's settings with the dataset's own settings.

    Args:
        dataset: Dataset being transformed. Its embedding model fields are read
            for ``"high_quality"`` indexing, and its ``retrieval_model`` is
            written when no ``retrieval_model`` argument is supplied.
        doc_form: Accepted for interface compatibility; not used here.
        indexing_technique: ``"high_quality"`` copies the dataset's embedding
            model onto the node; ``"economy"`` forces keyword search on a
            supplied retrieval model.
        retrieval_model: Retrieval settings mapping; may be empty.
        node: Graph node whose ``"data"`` entry holds the configuration.

    Returns:
        The same ``node``, with ``node["data"]`` updated from the reconciled
        configuration.
    """
    node_data = node.get("data", {})
    config = KnowledgeConfiguration(**node_data)

    if indexing_technique == "high_quality":
        # The node must embed with the model the dataset already uses.
        config.embedding_model = dataset.embedding_model
        config.embedding_model_provider = dataset.embedding_model_provider

    if not retrieval_model:
        # Nothing supplied: the node's retrieval settings become the dataset's.
        dataset.retrieval_model = config.retrieval_model.model_dump()
    else:
        setting = RetrievalSetting(**retrieval_model)
        if indexing_technique == "economy":
            setting.search_method = "keyword_search"
        config.retrieval_model = setting

    node_data.update(config.model_dump())
    node["data"] = node_data
    return node
def _create_pipeline(
    self,
    data: dict,
) -> Pipeline:
    """Create a pipeline plus its draft and published workflows from a template mapping.

    Args:
        data: Parsed template. ``data["rag_pipeline"]`` supplies the name and
            description; ``data["workflow"]`` supplies the graph and variables.

    Returns:
        The new ``Pipeline``, added to the session and pointing at the
        published workflow. Nothing is committed here.

    Raises:
        ValueError: If ``data["workflow"]`` is missing, empty, or not a dict.
    """
    workflow_data = data.get("workflow")
    if not workflow_data or not isinstance(workflow_data, dict):
        raise ValueError("Missing workflow data for rag pipeline")
    pipeline_data = data.get("rag_pipeline", {})

    environment_variables = [
        variable_factory.build_environment_variable_from_mapping(item)
        for item in workflow_data.get("environment_variables", [])
    ]
    conversation_variables = [
        variable_factory.build_conversation_variable_from_mapping(item)
        for item in workflow_data.get("conversation_variables", [])
    ]
    rag_pipeline_variables_list = workflow_data.get("rag_pipeline_variables", [])
    graph_json = json.dumps(workflow_data.get("graph", {}))

    # Create the pipeline row first so the workflows can reference its id.
    pipeline = Pipeline()
    pipeline.id = str(uuid4())
    pipeline.tenant_id = current_user.current_tenant_id
    pipeline.name = pipeline_data.get("name", "")
    pipeline.description = pipeline_data.get("description", "")
    pipeline.created_by = current_user.id
    pipeline.updated_by = current_user.id
    pipeline.is_published = True
    pipeline.is_public = True
    db.session.add(pipeline)
    db.session.flush()

    def _build_workflow(version: str) -> Workflow:
        # Draft and published workflows differ only in their version label.
        return Workflow(
            tenant_id=pipeline.tenant_id,
            app_id=pipeline.id,
            features="{}",
            type=WorkflowType.RAG_PIPELINE.value,
            version=version,
            graph=graph_json,
            created_by=current_user.id,
            environment_variables=environment_variables,
            conversation_variables=conversation_variables,
            rag_pipeline_variables=rag_pipeline_variables_list,
        )

    draft_workflow = _build_workflow("draft")
    # The published version label is the naive UTC timestamp of creation.
    published_workflow = _build_workflow(str(datetime.now(UTC).replace(tzinfo=None)))
    db.session.add(draft_workflow)
    db.session.add(published_workflow)
    # Flush before reading published_workflow.id below.
    db.session.flush()

    pipeline.workflow_id = published_workflow.id
    db.session.add(pipeline)
    return pipeline
def _deal_dependencies(self, pipeline_yaml: dict, tenant_id: str):
    """Install the marketplace plugins a pipeline template needs but the tenant lacks.

    Each ``dependencies`` entry of type ``"marketplace"`` is reduced to its
    plugin id (the part of ``plugin_unique_identifier`` before the first
    ``":"``). For ids the tenant has not installed, the identifier to install
    is looked up via ``PluginMigration._fetch_plugin_unique_identifier``; the
    identifier written in the template is not installed directly.

    Args:
        pipeline_yaml: Parsed pipeline template.
        tenant_id: Tenant whose installed plugins are checked and that
            receives the missing plugins.
    """
    import logging

    installed_plugins = PluginInstaller().list_plugins(tenant_id)
    plugin_migration = PluginMigration()

    # Set membership keeps the per-dependency check O(1).
    installed_plugin_ids = {plugin.plugin_id for plugin in installed_plugins}

    need_install_plugin_unique_identifiers = []
    for dependency in pipeline_yaml.get("dependencies") or []:
        if dependency.get("type") != "marketplace":
            continue
        declared_identifier = (dependency.get("value") or {}).get("plugin_unique_identifier")
        if not declared_identifier:
            # Malformed entry: there is no identifier to derive a plugin id from.
            continue
        plugin_id = declared_identifier.split(":")[0]
        if plugin_id in installed_plugin_ids:
            continue
        plugin_unique_identifier = plugin_migration._fetch_plugin_unique_identifier(plugin_id)  # type: ignore
        # A falsy lookup result means there is nothing installable for this id.
        if plugin_unique_identifier:
            need_install_plugin_unique_identifiers.append(plugin_unique_identifier)

    if need_install_plugin_unique_identifiers:
        logging.getLogger(__name__).debug(
            "Installing missing pipeline plugins %s", need_install_plugin_unique_identifiers
        )
        PluginService.install_from_marketplace_pkg(tenant_id, need_install_plugin_unique_identifiers)
def _transfrom_to_empty_pipeline(self, dataset: Dataset):
    """Attach a new pipeline with no workflow to ``dataset`` and commit.

    The pipeline copies the dataset's tenant, name and description. The
    dataset is switched to the ``"rag_pipeline"`` runtime mode and stamped
    with the current user and a naive UTC timestamp.

    Args:
        dataset: Dataset to convert.

    Returns:
        A dict with ``pipeline_id``, ``dataset_id`` and ``status`` set to
        ``"success"``.
    """
    empty_pipeline = Pipeline(
        tenant_id=dataset.tenant_id,
        name=dataset.name,
        description=dataset.description,
        created_by=current_user.id,
    )
    db.session.add(empty_pipeline)
    # Flush before reading empty_pipeline.id below.
    db.session.flush()

    dataset.pipeline_id = empty_pipeline.id
    dataset.runtime_mode = "rag_pipeline"
    dataset.updated_by = current_user.id
    dataset.updated_at = datetime.now(UTC).replace(tzinfo=None)
    db.session.add(dataset)
    db.session.commit()

    outcome = {
        "pipeline_id": empty_pipeline.id,
        "dataset_id": dataset.id,
        "status": "success",
    }
    return outcome
def _deal_document_data(self, dataset: Dataset):
    """Rewrite the dataset's documents into the pipeline datasource format.

    For every document of the dataset that has source info, the source type
    and ``data_source_info`` JSON are converted, and a
    ``DocumentPipelineExecutionLog`` tied to a fixed datasource node id is
    added to the session. Nothing is committed here.

    Args:
        dataset: Dataset whose documents are converted; its ``pipeline_id``
            is stored on each log.
    """
    # Fixed node ids; presumably they match the datasource nodes of the
    # bundled transform templates - verify against those YAML files.
    file_node_id = "1752479895761"
    notion_node_id = "1752489759475"
    jina_node_id = "1752491761974"
    firecrawl_node_id = "1752565402678"

    def _stage_log(document, datasource_type, datasource_info, datasource_node_id):
        # Queue the updated document together with its execution log.
        execution_log = DocumentPipelineExecutionLog(
            document_id=document.id,
            pipeline_id=dataset.pipeline_id,
            datasource_type=datasource_type,
            datasource_info=datasource_info,
            input_data={},
            created_by=document.created_by,
            created_at=document.created_at,
            datasource_node_id=datasource_node_id,
        )
        db.session.add(document)
        db.session.add(execution_log)

    for document in db.session.query(Document).where(Document.dataset_id == dataset.id).all():
        legacy_info = document.data_source_info_dict
        if not legacy_info:
            continue

        legacy_type = document.data_source_type
        if legacy_type == "upload_file":
            # The type is switched even when the file lookup below finds nothing.
            document.data_source_type = "local_file"
            file_id = legacy_info.get("upload_file_id")
            if not file_id:
                continue
            file = db.session.query(UploadFile).where(UploadFile.id == file_id).first()
            if not file:
                continue
            data_source_info = json.dumps(
                {
                    "real_file_id": file_id,
                    "name": file.name,
                    "size": file.size,
                    "extension": file.extension,
                    "mime_type": file.mime_type,
                    "url": "",
                    "transfer_method": "local_file",
                }
            )
            document.data_source_info = data_source_info
            _stage_log(document, "local_file", data_source_info, file_node_id)
        elif legacy_type == "notion_import":
            document.data_source_type = "online_document"
            page = {
                "page_id": legacy_info.get("notion_page_id"),
                "page_name": document.name,
                "page_icon": legacy_info.get("notion_page_icon"),
                "type": legacy_info.get("type"),
                "last_edited_time": legacy_info.get("last_edited_time"),
                "parent_id": None,
            }
            data_source_info = json.dumps(
                {
                    "workspace_id": legacy_info.get("notion_workspace_id"),
                    "page": page,
                }
            )
            document.data_source_info = data_source_info
            _stage_log(document, "online_document", data_source_info, notion_node_id)
        elif legacy_type == "website_crawl":
            document.data_source_type = "website_crawl"
            data_source_info = json.dumps(
                {
                    "source_url": legacy_info.get("url"),
                    "content": "",
                    "title": document.name,
                    "description": "",
                }
            )
            # The info is rewritten before the provider check, so a document
            # with an unknown provider is still updated but gets no log.
            document.data_source_info = data_source_info
            provider = legacy_info.get("provider")
            if provider == "firecrawl":
                datasource_node_id = firecrawl_node_id
            elif provider == "jinareader":
                datasource_node_id = jina_node_id
            else:
                continue
            _stage_log(document, "website_crawl", data_source_info, datasource_node_id)

View file

@ -0,0 +1,709 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: ''
icon_type: emoji
name: file-general-economy
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: if-else
id: 1752479895761-source-1752481129417-target
source: '1752479895761'
sourceHandle: source
target: '1752481129417'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: if-else
targetType: tool
id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
source: '1752481129417'
sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
target: '1752480460682'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: if-else
targetType: document-extractor
id: 1752481129417-false-1752481112180-target
source: '1752481129417'
sourceHandle: 'false'
target: '1752481112180'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: variable-aggregator
id: 1752480460682-source-1752482022496-target
source: '1752480460682'
sourceHandle: source
target: '1752482022496'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: document-extractor
targetType: variable-aggregator
id: 1752481112180-source-1752482022496-target
source: '1752481112180'
sourceHandle: source
target: '1752482022496'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: variable-aggregator
targetType: tool
id: 1752482022496-source-1752482151668-target
source: '1752482022496'
sourceHandle: source
target: '1752482151668'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752482151668-source-1752477924228-target
source: '1752482151668'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: text_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752482151668'
- result
indexing_technique: economy
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: keyword_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: true
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 1076.4656678451215
y: 281.3910724383104
positionAbsolute:
x: 1076.4656678451215
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: File
datasource_name: upload-file
datasource_parameters: {}
fileExtensions:
- txt
- markdown
- mdx
- pdf
- html
- xlsx
- xls
- vtt
- properties
- doc
- docx
- csv
- eml
- msg
- pptx
- xml
- epub
- ppt
- md
plugin_id: langgenius/file
provider_name: file
provider_type: local_file
selected: false
title: File
type: datasource
height: 52
id: '1752479895761'
position:
x: -839.8603427660498
y: 251.3910724383104
positionAbsolute:
x: -839.8603427660498
y: 251.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
documents:
description: the documents extracted from the file
items:
type: object
type: array
images:
description: The images extracted from the file
items:
type: object
type: array
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg,
jpeg)
ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
jpg, jpeg)
zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
label:
en_US: file
ja_JP: ファイル
pt_BR: arquivo
zh_Hans: file
llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
png, jpg, jpeg)
max: null
min: null
name: file
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: file
params:
file: ''
provider_id: langgenius/dify_extractor/dify_extractor
provider_name: langgenius/dify_extractor/dify_extractor
provider_type: builtin
selected: false
title: Dify Extractor
tool_configurations: {}
tool_description: Dify Extractor
tool_label: Dify Extractor
tool_name: dify_extractor
tool_parameters:
file:
type: variable
value:
- '1752479895761'
- file
type: tool
height: 52
id: '1752480460682'
position:
x: -108.28652292656551
y: 281.3910724383104
positionAbsolute:
x: -108.28652292656551
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_array_file: false
selected: false
title: 文档提取器
type: document-extractor
variable_selector:
- '1752479895761'
- file
height: 90
id: '1752481112180'
position:
x: -108.28652292656551
y: 390.6576481692478
positionAbsolute:
x: -108.28652292656551
y: 390.6576481692478
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
cases:
- case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
conditions:
- comparison_operator: is
id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
value: .xlsx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
value: .xls
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
value: .md
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
value: .markdown
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
value: .mdx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
value: .html
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
value: .htm
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
value: .docx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
value: .csv
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
value: .txt
varType: file
variable_selector:
- '1752479895761'
- file
- extension
id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
logical_operator: or
selected: false
title: IF/ELSE
type: if-else
height: 358
id: '1752481129417'
position:
x: -489.57009543377865
y: 251.3910724383104
positionAbsolute:
x: -489.57009543377865
y: 251.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
advanced_settings:
group_enabled: false
groups:
- groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
group_name: Group1
output_type: string
variables:
- - '1752481112180'
- text
- - '1752480460682'
- text
output_type: string
selected: false
title: Variable Aggregator
type: variable-aggregator
variables:
- - '1752481112180'
- text
- - '1752480460682'
- text
height: 129
id: '1752482022496'
position:
x: 319.441649575055
y: 281.3910724383104
positionAbsolute:
x: 319.441649575055
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: The result of the general chunk tool.
properties:
general_chunks:
items:
description: The chunk of the text.
type: string
type: array
type: object
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input Variable
ja_JP: 入力変数
pt_BR: Variável de entrada
zh_Hans: 输入变量
llm_description: The text you want to chunk.
max: null
min: null
name: input_variable
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The delimiter of the chunks.
ja_JP: チャンクの区切り記号。
pt_BR: O delimitador dos blocos.
zh_Hans: 块的分隔符。
label:
en_US: Delimiter
ja_JP: 区切り記号
pt_BR: Delimitador
zh_Hans: 分隔符
llm_description: The delimiter of the chunks, the format of the delimiter
must be a string.
max: null
min: null
name: delimiter
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The maximum chunk length.
ja_JP: 最大長のチャンク。
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度。
label:
en_US: Maximum Chunk Length
ja_JP: チャンク最大長
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度
llm_description: The maximum chunk length, the format of the chunk size
must be an integer.
max: null
min: null
name: max_chunk_length
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: The chunk overlap length.
ja_JP: チャンクの重複長
pt_BR: O comprimento de sobreposição dos fragmentos
zh_Hans: 块的重叠长度。
label:
en_US: Chunk Overlap Length
ja_JP: チャンク重複長
pt_BR: Comprimento de sobreposição do bloco
zh_Hans: 块的重叠长度
llm_description: The chunk overlap length, the format of the chunk overlap
length must be an integer.
max: null
min: null
name: chunk_overlap_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: Replace consecutive spaces, newlines and tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
zh_Hans: 替换连续的空格、换行符和制表符
label:
en_US: Replace Consecutive Spaces, Newlines and Tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
zh_Hans: 替换连续的空格、换行符和制表符
llm_description: Replace consecutive spaces, newlines and tabs, the format
of the replace must be a boolean.
max: null
min: null
name: replace_consecutive_spaces_newlines_tabs
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: null
form: llm
human_description:
en_US: Delete all URLs and email addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Excluir todos os URLs e endereços de e-mail
zh_Hans: 删除所有URL和电子邮件地址
label:
en_US: Delete All URLs and Email Addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Excluir todos os URLs e endereços de e-mail
zh_Hans: 删除所有URL和电子邮件地址
llm_description: Delete all URLs and email addresses, the format of the
delete must be a boolean.
max: null
min: null
name: delete_all_urls_and_email_addresses
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
chunk_overlap_length: ''
delete_all_urls_and_email_addresses: ''
delimiter: ''
input_variable: ''
max_chunk_length: ''
replace_consecutive_spaces_newlines_tabs: ''
provider_id: langgenius/general_chunker/general_chunker
provider_name: langgenius/general_chunker/general_chunker
provider_type: builtin
selected: false
title: General Chunker
tool_configurations: {}
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
tool_label: General Chunker
tool_name: general_chunker
tool_parameters:
chunk_overlap_length:
type: variable
value:
- rag
- shared
- chunk_overlap
delete_all_urls_and_email_addresses:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
delimiter:
type: mixed
value: '{{#rag.shared.delimiter#}}'
input_variable:
type: mixed
value: '{{#1752482022496.output#}}'
max_chunk_length:
type: variable
value:
- rag
- shared
- max_chunk_length
replace_consecutive_spaces_newlines_tabs:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
type: tool
height: 52
id: '1752482151668'
position:
x: 693.5300771507484
y: 281.3910724383104
positionAbsolute:
x: 693.5300771507484
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: 701.4999626224237
y: 128.33739021504016
zoom: 0.48941689643726966
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Chunk overlap
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: number
unit: characters
variable: chunk_overlap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View file

@ -0,0 +1,709 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: '#FFF4ED'
icon_type: emoji
name: file-general-high-quality
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: if-else
id: 1752479895761-source-1752481129417-target
source: '1752479895761'
sourceHandle: source
target: '1752481129417'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: if-else
targetType: tool
id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
source: '1752481129417'
sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
target: '1752480460682'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: if-else
targetType: document-extractor
id: 1752481129417-false-1752481112180-target
source: '1752481129417'
sourceHandle: 'false'
target: '1752481112180'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: variable-aggregator
id: 1752480460682-source-1752482022496-target
source: '1752480460682'
sourceHandle: source
target: '1752482022496'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: document-extractor
targetType: variable-aggregator
id: 1752481112180-source-1752482022496-target
source: '1752481112180'
sourceHandle: source
target: '1752482022496'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: variable-aggregator
targetType: tool
id: 1752482022496-source-1752482151668-target
source: '1752482022496'
sourceHandle: source
target: '1752482151668'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752482151668-source-1752477924228-target
source: '1752482151668'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: text_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752482151668'
- result
indexing_technique: high_quality
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: semantic_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: false
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 1076.4656678451215
y: 281.3910724383104
positionAbsolute:
x: 1076.4656678451215
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: File
datasource_name: upload-file
datasource_parameters: {}
fileExtensions:
- txt
- markdown
- mdx
- pdf
- html
- xlsx
- xls
- vtt
- properties
- doc
- docx
- csv
- eml
- msg
- pptx
- xml
- epub
- ppt
- md
plugin_id: langgenius/file
provider_name: file
provider_type: local_file
selected: false
title: File
type: datasource
height: 52
id: '1752479895761'
position:
x: -839.8603427660498
y: 251.3910724383104
positionAbsolute:
x: -839.8603427660498
y: 251.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
documents:
description: the documents extracted from the file
items:
type: object
type: array
images:
description: The images extracted from the file
items:
type: object
type: array
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg,
jpeg)
ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
jpg, jpeg)
zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
label:
en_US: file
ja_JP: ファイル
pt_BR: arquivo
zh_Hans: file
llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
png, jpg, jpeg)
max: null
min: null
name: file
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: file
params:
file: ''
provider_id: langgenius/dify_extractor/dify_extractor
provider_name: langgenius/dify_extractor/dify_extractor
provider_type: builtin
selected: false
title: Dify Extractor
tool_configurations: {}
tool_description: Dify Extractor
tool_label: Dify Extractor
tool_name: dify_extractor
tool_parameters:
file:
type: variable
value:
- '1752479895761'
- file
type: tool
height: 52
id: '1752480460682'
position:
x: -108.28652292656551
y: 281.3910724383104
positionAbsolute:
x: -108.28652292656551
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_array_file: false
selected: false
title: 文档提取器
type: document-extractor
variable_selector:
- '1752479895761'
- file
height: 90
id: '1752481112180'
position:
x: -108.28652292656551
y: 390.6576481692478
positionAbsolute:
x: -108.28652292656551
y: 390.6576481692478
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
cases:
- case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
conditions:
- comparison_operator: is
id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
value: .xlsx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
value: .xls
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
value: .md
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
value: .markdown
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
value: .mdx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
value: .html
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
value: .htm
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
value: .docx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
value: .csv
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
value: .txt
varType: file
variable_selector:
- '1752479895761'
- file
- extension
id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
logical_operator: or
selected: false
title: IF/ELSE
type: if-else
height: 358
id: '1752481129417'
position:
x: -489.57009543377865
y: 251.3910724383104
positionAbsolute:
x: -489.57009543377865
y: 251.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
advanced_settings:
group_enabled: false
groups:
- groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
group_name: Group1
output_type: string
variables:
- - '1752481112180'
- text
- - '1752480460682'
- text
output_type: string
selected: false
title: Variable Aggregator
type: variable-aggregator
variables:
- - '1752481112180'
- text
- - '1752480460682'
- text
height: 129
id: '1752482022496'
position:
x: 319.441649575055
y: 281.3910724383104
positionAbsolute:
x: 319.441649575055
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: The result of the general chunk tool.
properties:
general_chunks:
items:
description: The chunk of the text.
type: string
type: array
type: object
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input Variable
ja_JP: 入力変数
pt_BR: Variável de entrada
zh_Hans: 输入变量
llm_description: The text you want to chunk.
max: null
min: null
name: input_variable
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The delimiter of the chunks.
ja_JP: チャンクの区切り記号。
pt_BR: O delimitador dos pedaços.
zh_Hans: 块的分隔符。
label:
en_US: Delimiter
ja_JP: 区切り記号
pt_BR: Delimitador
zh_Hans: 分隔符
llm_description: The delimiter of the chunks, the format of the delimiter
must be a string.
max: null
min: null
name: delimiter
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The maximum chunk length.
ja_JP: 最大長のチャンク。
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度。
label:
en_US: Maximum Chunk Length
ja_JP: チャンク最大長
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度
llm_description: The maximum chunk length, the format of the chunk size
must be an integer.
max: null
min: null
name: max_chunk_length
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: The chunk overlap length.
ja_JP: チャンクの重複長
pt_BR: The chunk overlap length.
zh_Hans: 块的重叠长度。
label:
en_US: Chunk Overlap Length
ja_JP: チャンク重複長
pt_BR: Chunk Overlap Length
zh_Hans: 块的重叠长度
llm_description: The chunk overlap length, the format of the chunk overlap
length must be an integer.
max: null
min: null
name: chunk_overlap_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: Replace consecutive spaces, newlines and tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace consecutive spaces, newlines and tabs
zh_Hans: 替换连续的空格、换行符和制表符
label:
en_US: Replace Consecutive Spaces, Newlines and Tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
zh_Hans: 替换连续的空格、换行符和制表符
llm_description: Replace consecutive spaces, newlines and tabs, the format
of the replace must be a boolean.
max: null
min: null
name: replace_consecutive_spaces_newlines_tabs
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: null
form: llm
human_description:
en_US: Delete all URLs and email addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete all URLs and email addresses
zh_Hans: 删除所有URL和电子邮件地址
label:
en_US: Delete All URLs and Email Addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete All URLs and Email Addresses
zh_Hans: 删除所有URL和电子邮件地址
llm_description: Delete all URLs and email addresses, the format of the
delete must be a boolean.
max: null
min: null
name: delete_all_urls_and_email_addresses
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
chunk_overlap_length: ''
delete_all_urls_and_email_addresses: ''
delimiter: ''
input_variable: ''
max_chunk_length: ''
replace_consecutive_spaces_newlines_tabs: ''
provider_id: langgenius/general_chunker/general_chunker
provider_name: langgenius/general_chunker/general_chunker
provider_type: builtin
selected: false
title: General Chunker
tool_configurations: {}
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
tool_label: General Chunker
tool_name: general_chunker
tool_parameters:
chunk_overlap_length:
type: variable
value:
- rag
- shared
- chunk_overlap
delete_all_urls_and_email_addresses:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
delimiter:
type: mixed
value: '{{#rag.shared.delimiter#}}'
input_variable:
type: mixed
value: '{{#1752482022496.output#}}'
max_chunk_length:
type: variable
value:
- rag
- shared
- max_chunk_length
replace_consecutive_spaces_newlines_tabs:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
type: tool
height: 52
id: '1752482151668'
position:
x: 693.5300771507484
y: 281.3910724383104
positionAbsolute:
x: 693.5300771507484
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: 701.4999626224237
y: 128.33739021504016
zoom: 0.48941689643726966
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Chunk overlap
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: number
unit: characters
variable: chunk_overlap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View file

@ -0,0 +1,814 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: '#FFF4ED'
icon_type: emoji
name: file-parentchild
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: if-else
id: 1752479895761-source-1752481129417-target
source: '1752479895761'
sourceHandle: source
target: '1752481129417'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: if-else
targetType: tool
id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
source: '1752481129417'
sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
target: '1752480460682'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: if-else
targetType: document-extractor
id: 1752481129417-false-1752481112180-target
source: '1752481129417'
sourceHandle: 'false'
target: '1752481112180'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: variable-aggregator
id: 1752480460682-source-1752482022496-target
source: '1752480460682'
sourceHandle: source
target: '1752482022496'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: document-extractor
targetType: variable-aggregator
id: 1752481112180-source-1752482022496-target
source: '1752481112180'
sourceHandle: source
target: '1752482022496'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: variable-aggregator
targetType: tool
id: 1752482022496-source-1752575473519-target
source: '1752482022496'
sourceHandle: source
target: '1752575473519'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752575473519-source-1752477924228-target
source: '1752575473519'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: hierarchical_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752575473519'
- result
indexing_technique: high_quality
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: semantic_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: false
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 994.3774545394483
y: 281.3910724383104
positionAbsolute:
x: 994.3774545394483
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: File
datasource_name: upload-file
datasource_parameters: {}
fileExtensions:
- txt
- markdown
- mdx
- pdf
- html
- xlsx
- xls
- vtt
- properties
- doc
- docx
- csv
- eml
- msg
- pptx
- xml
- epub
- ppt
- md
plugin_id: langgenius/file
provider_name: file
provider_type: local_file
selected: false
title: File
type: datasource
height: 52
id: '1752479895761'
position:
x: -839.8603427660498
y: 251.3910724383104
positionAbsolute:
x: -839.8603427660498
y: 251.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
documents:
description: the documents extracted from the file
items:
type: object
type: array
images:
description: The images extracted from the file
items:
type: object
type: array
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: the file to be parsed (supports pdf, ppt, pptx, doc, docx, png, jpg,
jpeg)
ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
jpg, jpeg)
zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
label:
en_US: file
ja_JP: ファイル
pt_BR: arquivo
zh_Hans: file
llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
png, jpg, jpeg)
max: null
min: null
name: file
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: file
params:
file: ''
provider_id: langgenius/dify_extractor/dify_extractor
provider_name: langgenius/dify_extractor/dify_extractor
provider_type: builtin
selected: false
title: Dify Extractor
tool_configurations: {}
tool_description: Dify Extractor
tool_label: Dify Extractor
tool_name: dify_extractor
tool_parameters:
file:
type: variable
value:
- '1752479895761'
- file
type: tool
height: 52
id: '1752480460682'
position:
x: -108.28652292656551
y: 281.3910724383104
positionAbsolute:
x: -108.28652292656551
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_array_file: false
selected: false
title: 文档提取器
type: document-extractor
variable_selector:
- '1752479895761'
- file
height: 90
id: '1752481112180'
position:
x: -108.28652292656551
y: 390.6576481692478
positionAbsolute:
x: -108.28652292656551
y: 390.6576481692478
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
cases:
- case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
conditions:
- comparison_operator: is
id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
value: .xlsx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
value: .xls
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
value: .md
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
value: .markdown
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
value: .mdx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
value: .html
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
value: .htm
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
value: .docx
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
value: .csv
varType: file
variable_selector:
- '1752479895761'
- file
- extension
- comparison_operator: is
id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
value: .txt
varType: file
variable_selector:
- '1752479895761'
- file
- extension
id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
logical_operator: or
selected: false
title: IF/ELSE
type: if-else
height: 358
id: '1752481129417'
position:
x: -512.2335487893622
y: 251.3910724383104
positionAbsolute:
x: -512.2335487893622
y: 251.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
advanced_settings:
group_enabled: false
groups:
- groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
group_name: Group1
output_type: string
variables:
- - '1752481112180'
- text
- - '1752480460682'
- text
output_type: string
selected: false
title: Variable Aggregator
type: variable-aggregator
variables:
- - '1752481112180'
- text
- - '1752480460682'
- text
height: 129
id: '1752482022496'
position:
x: 319.441649575055
y: 281.3910724383104
positionAbsolute:
x: 319.441649575055
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: Parent child chunks result
items:
type: object
type: array
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input text
ja_JP: 入力テキスト
pt_BR: Texto de entrada
zh_Hans: 输入文本
llm_description: The text you want to chunk.
max: null
min: null
name: input_text
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: 1024
form: llm
human_description:
en_US: Maximum length for chunking
ja_JP: チャンク分割の最大長
pt_BR: Comprimento máximo para divisão
zh_Hans: 用于分块的最大长度
label:
en_US: Maximum Length
ja_JP: 最大長
pt_BR: Comprimento Máximo
zh_Hans: 最大长度
llm_description: Maximum length allowed per chunk
max: null
min: null
name: max_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: '
'
form: llm
human_description:
en_US: Separator used for chunking
ja_JP: チャンク分割に使用する区切り文字
pt_BR: Separador usado para divisão
zh_Hans: 用于分块的分隔符
label:
en_US: Chunk Separator
ja_JP: チャンク区切り文字
pt_BR: Separador de Divisão
zh_Hans: 分块分隔符
llm_description: The separator used to split chunks
max: null
min: null
name: separator
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: string
- auto_generate: null
default: 512
form: llm
human_description:
en_US: Maximum length for subchunking
ja_JP: サブチャンク分割の最大長
pt_BR: Comprimento máximo para subdivisão
zh_Hans: 用于子分块的最大长度
label:
en_US: Subchunk Maximum Length
ja_JP: サブチャンク最大長
pt_BR: Comprimento Máximo de Subdivisão
zh_Hans: 子分块最大长度
llm_description: Maximum length allowed per subchunk
max: null
min: null
name: subchunk_max_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: '. '
form: llm
human_description:
en_US: Separator used for subchunking
ja_JP: サブチャンク分割に使用する区切り文字
pt_BR: Separador usado para subdivisão
zh_Hans: 用于子分块的分隔符
label:
en_US: Subchunk Separator
ja_JP: サブチャンキング用セパレーター
pt_BR: Separador de Subdivisão
zh_Hans: 子分块分隔符
llm_description: The separator used to split subchunks
max: null
min: null
name: subchunk_separator
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: string
- auto_generate: null
default: paragraph
form: llm
human_description:
en_US: Split text into paragraphs based on separator and maximum chunk
length, using split text as parent block or entire document as parent
block and directly retrieve.
ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
máximo do bloco, usando o texto dividido como bloco pai ou documento
completo como bloco pai e diretamente recuperá-lo.
zh_Hans: 根据分隔符和最大块长度将文本拆分为段落,使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
label:
en_US: Parent Mode
ja_JP: 親子モード
pt_BR: Modo Pai
zh_Hans: 父块模式
llm_description: Split text into paragraphs based on separator and maximum
chunk length, using split text as parent block or entire document as parent
block and directly retrieve.
max: null
min: null
name: parent_mode
options:
- icon: ''
label:
en_US: Paragraph
ja_JP: 段落
pt_BR: Parágrafo
zh_Hans: 段落
value: paragraph
- icon: ''
label:
en_US: Full Document
ja_JP: 全文
pt_BR: Documento Completo
zh_Hans: 全文
value: full_doc
placeholder: null
precision: null
required: true
scope: null
template: null
type: select
- auto_generate: null
default: 0
form: llm
human_description:
en_US: Whether to remove extra spaces in the text
ja_JP: テキスト内の余分なスペースを削除するかどうか
pt_BR: Se deve remover espaços extras no texto
zh_Hans: 是否移除文本中的多余空格
label:
en_US: Remove Extra Spaces
ja_JP: 余分なスペースを削除
pt_BR: Remover Espaços Extras
zh_Hans: 移除多余空格
llm_description: Whether to remove extra spaces in the text
max: null
min: null
name: remove_extra_spaces
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: 0
form: llm
human_description:
en_US: Whether to remove URLs and emails in the text
ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
pt_BR: Se deve remover URLs e e-mails no texto
zh_Hans: 是否移除文本中的URL和电子邮件地址
label:
en_US: Remove URLs and Emails
ja_JP: URLとメールアドレスを削除
pt_BR: Remover URLs e E-mails
zh_Hans: 移除URL和电子邮件地址
llm_description: Whether to remove URLs and emails in the text
max: null
min: null
name: remove_urls_emails
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
input_text: ''
max_length: ''
parent_mode: ''
remove_extra_spaces: ''
remove_urls_emails: ''
separator: ''
subchunk_max_length: ''
subchunk_separator: ''
provider_id: langgenius/parentchild_chunker/parentchild_chunker
provider_name: langgenius/parentchild_chunker/parentchild_chunker
provider_type: builtin
selected: false
title: Parent-child Chunker
tool_configurations: {}
tool_description: Parent-child Chunk Structure
tool_label: Parent-child Chunker
tool_name: parentchild_chunker
tool_parameters:
input_text:
type: mixed
value: '{{#1752482022496.output#}}'
max_length:
type: variable
value:
- rag
- shared
- max_chunk_length
parent_mode:
type: variable
value:
- rag
- shared
- parent_mode
remove_extra_spaces:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
remove_urls_emails:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
separator:
type: mixed
value: '{{#rag.shared.delimiter#}}'
subchunk_max_length:
type: variable
value:
- rag
- shared
- child_max_chunk_length
subchunk_separator:
type: mixed
value: '{{#rag.shared.child_delimiter#}}'
type: tool
height: 52
id: '1752575473519'
position:
x: 637.9241611063885
y: 281.3910724383104
positionAbsolute:
x: 637.9241611063885
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: 948.6766333808323
y: -102.06757184183238
zoom: 0.8375774577380971
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 256
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 1024
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n
label: Child delimiter
max_length: 256
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n is recommended
for splitting parent chunks into small child chunks. You can also use special
delimiters defined by yourself.
type: text-input
unit: null
variable: child_delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 512
label: Child max chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: child_max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: paragraph
label: Parent mode
max_length: 48
options:
- full_doc
- paragraph
placeholder: null
required: true
tooltips: null
type: select
unit: null
variable: parent_mode
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View file

@ -0,0 +1,400 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: ''
icon_type: emoji
name: notion-general-economy
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752482151668-source-1752477924228-target
source: '1752482151668'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: tool
id: 1752489759475-source-1752482151668-target
source: '1752489759475'
sourceHandle: source
target: '1752482151668'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: text_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752482151668'
- result
indexing_technique: economy
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: keyword_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: true
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 1444.5503479271906
y: 281.3910724383104
positionAbsolute:
x: 1444.5503479271906
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: The result of the general chunk tool.
properties:
general_chunks:
items:
description: The chunk of the text.
type: string
type: array
type: object
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input Variable
ja_JP: 入力変数
pt_BR: Variável de entrada
zh_Hans: 输入变量
llm_description: The text you want to chunk.
max: null
min: null
name: input_variable
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The delimiter of the chunks.
ja_JP: チャンクの区切り記号。
pt_BR: O delimitador dos pedaços.
zh_Hans: 块的分隔符。
label:
en_US: Delimiter
ja_JP: 区切り記号
pt_BR: Delimitador
zh_Hans: 分隔符
llm_description: The delimiter of the chunks, the format of the delimiter
must be a string.
max: null
min: null
name: delimiter
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The maximum chunk length.
ja_JP: 最大長のチャンク。
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度。
label:
en_US: Maximum Chunk Length
ja_JP: チャンク最大長
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度
llm_description: The maximum chunk length, the format of the chunk size
must be an integer.
max: null
min: null
name: max_chunk_length
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: The chunk overlap length.
ja_JP: チャンクの重複長
pt_BR: The chunk overlap length.
zh_Hans: 块的重叠长度。
label:
en_US: Chunk Overlap Length
ja_JP: チャンク重複長
pt_BR: Chunk Overlap Length
zh_Hans: 块的重叠长度
llm_description: The chunk overlap length, the format of the chunk overlap
length must be an integer.
max: null
min: null
name: chunk_overlap_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: Replace consecutive spaces, newlines and tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace consecutive spaces, newlines and tabs
zh_Hans: 替换连续的空格、换行符和制表符
label:
en_US: Replace Consecutive Spaces, Newlines and Tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
zh_Hans: 替换连续的空格、换行符和制表符
llm_description: Replace consecutive spaces, newlines and tabs, the format
of the replace must be a boolean.
max: null
min: null
name: replace_consecutive_spaces_newlines_tabs
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: null
form: llm
human_description:
en_US: Delete all URLs and email addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete all URLs and email addresses
zh_Hans: 删除所有URL和电子邮件地址
label:
en_US: Delete All URLs and Email Addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete All URLs and Email Addresses
zh_Hans: 删除所有URL和电子邮件地址
llm_description: Delete all URLs and email addresses, the format of the
delete must be a boolean.
max: null
min: null
name: delete_all_urls_and_email_addresses
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
chunk_overlap_length: ''
delete_all_urls_and_email_addresses: ''
delimiter: ''
input_variable: ''
max_chunk_length: ''
replace_consecutive_spaces_newlines_tabs: ''
provider_id: langgenius/general_chunker/general_chunker
provider_name: langgenius/general_chunker/general_chunker
provider_type: builtin
selected: false
title: General Chunker
tool_configurations: {}
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
tool_label: General Chunker
tool_name: general_chunker
tool_parameters:
chunk_overlap_length:
type: variable
value:
- rag
- shared
- chunk_overlap
delete_all_urls_and_email_addresses:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
delimiter:
type: mixed
value: '{{#rag.shared.delimiter#}}'
input_variable:
type: mixed
value: '{{#1752489759475.content#}}'
max_chunk_length:
type: variable
value:
- rag
- shared
- max_chunk_length
replace_consecutive_spaces_newlines_tabs:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
type: tool
height: 52
id: '1752482151668'
position:
x: 1063.6922916384628
y: 281.3910724383104
positionAbsolute:
x: 1063.6922916384628
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Notion数据源
datasource_name: notion_datasource
datasource_parameters: {}
plugin_id: langgenius/notion_datasource
provider_name: notion_datasource
provider_type: online_document
selected: false
title: Notion数据源
type: datasource
height: 52
id: '1752489759475'
position:
x: 736.9082104000458
y: 281.3910724383104
positionAbsolute:
x: 736.9082104000458
y: 281.3910724383104
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: -838.569649323166
y: -168.94656489167426
zoom: 1.286925643857699
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Chunk overlap
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: number
unit: characters
variable: chunk_overlap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View file

@ -0,0 +1,400 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: '#FFF4ED'
icon_type: emoji
name: notion-general-high-quality
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752482151668-source-1752477924228-target
source: '1752482151668'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: tool
id: 1752489759475-source-1752482151668-target
source: '1752489759475'
sourceHandle: source
target: '1752482151668'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: text_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752482151668'
- result
indexing_technique: high_quality
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: semantic_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: true
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 1444.5503479271906
y: 281.3910724383104
positionAbsolute:
x: 1444.5503479271906
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: The result of the general chunk tool.
properties:
general_chunks:
items:
description: The chunk of the text.
type: string
type: array
type: object
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input Variable
ja_JP: 入力変数
pt_BR: Variável de entrada
zh_Hans: 输入变量
llm_description: The text you want to chunk.
max: null
min: null
name: input_variable
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The delimiter of the chunks.
ja_JP: チャンクの区切り記号。
pt_BR: O delimitador dos pedaços.
zh_Hans: 块的分隔符。
label:
en_US: Delimiter
ja_JP: 区切り記号
pt_BR: Delimitador
zh_Hans: 分隔符
llm_description: The delimiter of the chunks, the format of the delimiter
must be a string.
max: null
min: null
name: delimiter
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The maximum chunk length.
ja_JP: チャンクの最大長。
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度。
label:
en_US: Maximum Chunk Length
ja_JP: チャンク最大長
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度
llm_description: The maximum chunk length, the format of the chunk size
must be an integer.
max: null
min: null
name: max_chunk_length
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: The chunk overlap length.
ja_JP: チャンクの重複長
pt_BR: The chunk overlap length.
zh_Hans: 块的重叠长度。
label:
en_US: Chunk Overlap Length
ja_JP: チャンク重複長
pt_BR: Chunk Overlap Length
zh_Hans: 块的重叠长度
llm_description: The chunk overlap length, the format of the chunk overlap
length must be an integer.
max: null
min: null
name: chunk_overlap_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: Replace consecutive spaces, newlines and tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace consecutive spaces, newlines and tabs
zh_Hans: 替换连续的空格、换行符和制表符
label:
en_US: Replace Consecutive Spaces, Newlines and Tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
zh_Hans: 替换连续的空格、换行符和制表符
llm_description: Replace consecutive spaces, newlines and tabs, the format
of the replace must be a boolean.
max: null
min: null
name: replace_consecutive_spaces_newlines_tabs
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: null
form: llm
human_description:
en_US: Delete all URLs and email addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete all URLs and email addresses
zh_Hans: 删除所有URL和电子邮件地址
label:
en_US: Delete All URLs and Email Addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete All URLs and Email Addresses
zh_Hans: 删除所有URL和电子邮件地址
llm_description: Delete all URLs and email addresses, the format of the
delete must be a boolean.
max: null
min: null
name: delete_all_urls_and_email_addresses
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
chunk_overlap_length: ''
delete_all_urls_and_email_addresses: ''
delimiter: ''
input_variable: ''
max_chunk_length: ''
replace_consecutive_spaces_newlines_tabs: ''
provider_id: langgenius/general_chunker/general_chunker
provider_name: langgenius/general_chunker/general_chunker
provider_type: builtin
selected: false
title: General Chunker
tool_configurations: {}
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
tool_label: General Chunker
tool_name: general_chunker
tool_parameters:
chunk_overlap_length:
type: variable
value:
- rag
- shared
- chunk_overlap
delete_all_urls_and_email_addresses:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
delimiter:
type: mixed
value: '{{#rag.shared.delimiter#}}'
input_variable:
type: mixed
value: '{{#1752489759475.content#}}'
max_chunk_length:
type: variable
value:
- rag
- shared
- max_chunk_length
replace_consecutive_spaces_newlines_tabs:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
type: tool
height: 52
id: '1752482151668'
position:
x: 1063.6922916384628
y: 281.3910724383104
positionAbsolute:
x: 1063.6922916384628
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Notion数据源
datasource_name: notion_datasource
datasource_parameters: {}
plugin_id: langgenius/notion_datasource
provider_name: notion_datasource
provider_type: online_document
selected: false
title: Notion数据源
type: datasource
height: 52
id: '1752489759475'
position:
x: 736.9082104000458
y: 281.3910724383104
positionAbsolute:
x: 736.9082104000458
y: 281.3910724383104
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: -838.569649323166
y: -168.94656489167426
zoom: 1.286925643857699
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Chunk overlap
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: number
unit: characters
variable: chunk_overlap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View file

@ -0,0 +1,506 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: ''
icon_type: emoji
name: notion-parentchild
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: tool
id: 1752489759475-source-1752490343805-target
source: '1752489759475'
sourceHandle: source
target: '1752490343805'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752490343805-source-1752477924228-target
source: '1752490343805'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: hierarchical_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752490343805'
- result
indexing_technique: high_quality
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: semantic_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: false
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 1486.2052698032674
y: 281.3910724383104
positionAbsolute:
x: 1486.2052698032674
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Notion数据源
datasource_name: notion_datasource
datasource_parameters: {}
plugin_id: langgenius/notion_datasource
provider_name: notion_datasource
provider_type: online_document
selected: false
title: Notion数据源
type: datasource
height: 52
id: '1752489759475'
position:
x: 736.9082104000458
y: 281.3910724383104
positionAbsolute:
x: 736.9082104000458
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: Parent child chunks result
items:
type: object
type: array
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input text
ja_JP: 入力テキスト
pt_BR: Texto de entrada
zh_Hans: 输入文本
llm_description: The text you want to chunk.
max: null
min: null
name: input_text
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: 1024
form: llm
human_description:
en_US: Maximum length for chunking
ja_JP: チャンク分割の最大長
pt_BR: Comprimento máximo para divisão
zh_Hans: 用于分块的最大长度
label:
en_US: Maximum Length
ja_JP: 最大長
pt_BR: Comprimento Máximo
zh_Hans: 最大长度
llm_description: Maximum length allowed per chunk
max: null
min: null
name: max_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: '
'
form: llm
human_description:
en_US: Separator used for chunking
ja_JP: チャンク分割に使用する区切り文字
pt_BR: Separador usado para divisão
zh_Hans: 用于分块的分隔符
label:
en_US: Chunk Separator
ja_JP: チャンク区切り文字
pt_BR: Separador de Divisão
zh_Hans: 分块分隔符
llm_description: The separator used to split chunks
max: null
min: null
name: separator
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: string
- auto_generate: null
default: 512
form: llm
human_description:
en_US: Maximum length for subchunking
ja_JP: サブチャンク分割の最大長
pt_BR: Comprimento máximo para subdivisão
zh_Hans: 用于子分块的最大长度
label:
en_US: Subchunk Maximum Length
ja_JP: サブチャンク最大長
pt_BR: Comprimento Máximo de Subdivisão
zh_Hans: 子分块最大长度
llm_description: Maximum length allowed per subchunk
max: null
min: null
name: subchunk_max_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: '. '
form: llm
human_description:
en_US: Separator used for subchunking
ja_JP: サブチャンク分割に使用する区切り文字
pt_BR: Separador usado para subdivisão
zh_Hans: 用于子分块的分隔符
label:
en_US: Subchunk Separator
ja_JP: サブチャンキング用セパレーター
pt_BR: Separador de Subdivisão
zh_Hans: 子分块分隔符
llm_description: The separator used to split subchunks
max: null
min: null
name: subchunk_separator
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: string
- auto_generate: null
default: paragraph
form: llm
human_description:
en_US: Split text into paragraphs based on separator and maximum chunk
length, using split text as parent block or entire document as parent
block and directly retrieve.
ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
máximo do bloco, usando o texto dividido como bloco pai ou documento
completo como bloco pai e diretamente recuperá-lo.
zh_Hans: 根据分隔符和最大块长度将文本拆分为段落,使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
label:
en_US: Parent Mode
ja_JP: 親子モード
pt_BR: Modo Pai
zh_Hans: 父块模式
llm_description: Split text into paragraphs based on separator and maximum
chunk length, using split text as parent block or entire document as parent
block and directly retrieve.
max: null
min: null
name: parent_mode
options:
- icon: ''
label:
en_US: Paragraph
ja_JP: 段落
pt_BR: Parágrafo
zh_Hans: 段落
value: paragraph
- icon: ''
label:
en_US: Full Document
ja_JP: 全文
pt_BR: Documento Completo
zh_Hans: 全文
value: full_doc
placeholder: null
precision: null
required: true
scope: null
template: null
type: select
- auto_generate: null
default: 0
form: llm
human_description:
en_US: Whether to remove extra spaces in the text
ja_JP: テキスト内の余分なスペースを削除するかどうか
pt_BR: Se deve remover espaços extras no texto
zh_Hans: 是否移除文本中的多余空格
label:
en_US: Remove Extra Spaces
ja_JP: 余分なスペースを削除
pt_BR: Remover Espaços Extras
zh_Hans: 移除多余空格
llm_description: Whether to remove extra spaces in the text
max: null
min: null
name: remove_extra_spaces
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: 0
form: llm
human_description:
en_US: Whether to remove URLs and emails in the text
ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
pt_BR: Se deve remover URLs e e-mails no texto
zh_Hans: 是否移除文本中的URL和电子邮件地址
label:
en_US: Remove URLs and Emails
ja_JP: URLとメールアドレスを削除
pt_BR: Remover URLs e E-mails
zh_Hans: 移除URL和电子邮件地址
llm_description: Whether to remove URLs and emails in the text
max: null
min: null
name: remove_urls_emails
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
input_text: ''
max_length: ''
parent_mode: ''
remove_extra_spaces: ''
remove_urls_emails: ''
separator: ''
subchunk_max_length: ''
subchunk_separator: ''
provider_id: langgenius/parentchild_chunker/parentchild_chunker
provider_name: langgenius/parentchild_chunker/parentchild_chunker
provider_type: builtin
selected: true
title: Parent-child Chunker
tool_configurations: {}
tool_description: Parent-child Chunk Structure
tool_label: Parent-child Chunker
tool_name: parentchild_chunker
tool_parameters:
input_text:
type: mixed
value: '{{#1752489759475.content#}}'
max_length:
type: variable
value:
- rag
- shared
- max_chunk_length
parent_mode:
type: variable
value:
- rag
- shared
- parent_mode
remove_extra_spaces:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
remove_urls_emails:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
separator:
type: mixed
value: '{{#rag.shared.delimiter#}}'
subchunk_max_length:
type: variable
value:
- rag
- shared
- child_max_chunk_length
subchunk_separator:
type: mixed
value: '{{#rag.shared.child_delimiter#}}'
type: tool
height: 52
id: '1752490343805'
position:
x: 1077.0240183162543
y: 281.3910724383104
positionAbsolute:
x: 1077.0240183162543
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: -487.2912544090391
y: -54.7029301848807
zoom: 0.9994011715768695
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 1024
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n
label: Child delimiter
max_length: 199
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n is recommended
for splitting parent chunks into small child chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: child_delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 512
label: Child max chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: child_max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: paragraph
label: Parent mode
max_length: 48
options:
- full_doc
- paragraph
placeholder: null
required: true
tooltips: null
type: select
unit: null
variable: parent_mode
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View file

@ -0,0 +1,674 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: ''
icon_type: emoji
name: website-crawl-general-economy
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: variable-aggregator
id: 1752491761974-source-1752565435219-target
source: '1752491761974'
sourceHandle: source
target: '1752565435219'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: datasource
targetType: variable-aggregator
id: 1752565402678-source-1752565435219-target
source: '1752565402678'
sourceHandle: source
target: '1752565435219'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: variable-aggregator
targetType: tool
id: 1752565435219-source-1752569675978-target
source: '1752565435219'
sourceHandle: source
target: '1752569675978'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752569675978-source-1752477924228-target
source: '1752569675978'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: text_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752569675978'
- result
indexing_technique: economy
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: keyword_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: true
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 2140.4053851189346
y: 281.3910724383104
positionAbsolute:
x: 2140.4053851189346
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Jina Reader
datasource_name: jina_reader
datasource_parameters:
crawl_sub_pages:
type: mixed
value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
limit:
type: variable
value:
- rag
- '1752491761974'
- jina_limit
url:
type: mixed
value: '{{#rag.1752491761974.jina_url#}}'
use_sitemap:
type: mixed
value: '{{#rag.1752491761974.jina_use_sitemap#}}'
plugin_id: langgenius/jina_datasource
provider_name: jina
provider_type: website_crawl
selected: false
title: Jina Reader
type: datasource
height: 52
id: '1752491761974'
position:
x: 1067.7526055798794
y: 281.3910724383104
positionAbsolute:
x: 1067.7526055798794
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Firecrawl
datasource_name: crawl
datasource_parameters:
crawl_subpages:
type: mixed
value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
exclude_paths:
type: mixed
value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
include_paths:
type: mixed
value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
limit:
type: variable
value:
- rag
- '1752565402678'
- firecrawl_limit
max_depth:
type: variable
value:
- rag
- '1752565402678'
- firecrawl_max_depth
only_main_content:
type: mixed
value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
url:
type: mixed
value: '{{#rag.1752565402678.firecrawl_url#}}'
plugin_id: langgenius/firecrawl_datasource
provider_name: firecrawl
provider_type: website_crawl
selected: false
title: Firecrawl
type: datasource
height: 52
id: '1752565402678'
position:
x: 1067.7526055798794
y: 417.32608398342404
positionAbsolute:
x: 1067.7526055798794
y: 417.32608398342404
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
output_type: string
selected: false
title: Variable Aggregator
type: variable-aggregator
variables:
- - '1752491761974'
- content
- - '1752565402678'
- content
height: 129
id: '1752565435219'
position:
x: 1505.4306671642219
y: 281.3910724383104
positionAbsolute:
x: 1505.4306671642219
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: The result of the general chunk tool.
properties:
general_chunks:
items:
description: The chunk of the text.
type: string
type: array
type: object
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input Variable
ja_JP: 入力変数
pt_BR: Variável de entrada
zh_Hans: 输入变量
llm_description: The text you want to chunk.
max: null
min: null
name: input_variable
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The delimiter of the chunks.
ja_JP: チャンクの区切り記号。
pt_BR: O delimitador dos pedaços.
zh_Hans: 块的分隔符。
label:
en_US: Delimiter
ja_JP: 区切り記号
pt_BR: Delimitador
zh_Hans: 分隔符
llm_description: The delimiter of the chunks, the format of the delimiter
must be a string.
max: null
min: null
name: delimiter
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The maximum chunk length.
ja_JP: チャンクの最大長。
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度。
label:
en_US: Maximum Chunk Length
ja_JP: チャンク最大長
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度
llm_description: The maximum chunk length, the format of the chunk size
must be an integer.
max: null
min: null
name: max_chunk_length
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: The chunk overlap length.
ja_JP: チャンクの重複長
pt_BR: The chunk overlap length.
zh_Hans: 块的重叠长度。
label:
en_US: Chunk Overlap Length
ja_JP: チャンク重複長
pt_BR: Chunk Overlap Length
zh_Hans: 块的重叠长度
llm_description: The chunk overlap length, the format of the chunk overlap
length must be an integer.
max: null
min: null
name: chunk_overlap_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: Replace consecutive spaces, newlines and tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace consecutive spaces, newlines and tabs
zh_Hans: 替换连续的空格、换行符和制表符
label:
en_US: Replace Consecutive Spaces, Newlines and Tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
zh_Hans: 替换连续的空格、换行符和制表符
llm_description: Replace consecutive spaces, newlines and tabs, the format
of the replace must be a boolean.
max: null
min: null
name: replace_consecutive_spaces_newlines_tabs
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: null
form: llm
human_description:
en_US: Delete all URLs and email addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete all URLs and email addresses
zh_Hans: 删除所有URL和电子邮件地址
label:
en_US: Delete All URLs and Email Addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete All URLs and Email Addresses
zh_Hans: 删除所有URL和电子邮件地址
llm_description: Delete all URLs and email addresses, the format of the
delete must be a boolean.
max: null
min: null
name: delete_all_urls_and_email_addresses
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
chunk_overlap_length: ''
delete_all_urls_and_email_addresses: ''
delimiter: ''
input_variable: ''
max_chunk_length: ''
replace_consecutive_spaces_newlines_tabs: ''
provider_id: langgenius/general_chunker/general_chunker
provider_name: langgenius/general_chunker/general_chunker
provider_type: builtin
selected: false
title: General Chunker
tool_configurations: {}
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
tool_label: General Chunker
tool_name: general_chunker
tool_parameters:
chunk_overlap_length:
type: variable
value:
- rag
- shared
- chunk_overlap
delete_all_urls_and_email_addresses:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
delimiter:
type: mixed
value: '{{#rag.shared.delimiter#}}'
input_variable:
type: mixed
value: '{{#1752565435219.output#}}'
max_chunk_length:
type: variable
value:
- rag
- shared
- max_chunk_length
replace_consecutive_spaces_newlines_tabs:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
type: tool
height: 52
id: '1752569675978'
position:
x: 1807.4306671642219
y: 281.3910724383104
positionAbsolute:
x: 1807.4306671642219
y: 281.3910724383104
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: -707.721097109337
y: -93.07807382100896
zoom: 0.9350632198875476
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: URL
max_length: 256
options: []
placeholder: https://docs.dify.ai/en/
required: true
tooltips: null
type: text-input
unit: null
variable: jina_url
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: 10
label: Limit
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: null
variable: jina_limit
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: Crawl sub-pages
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: jina_crawl_sub_pages
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: Use sitemap
max_length: 48
options: []
placeholder: null
required: false
tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
iteratively based on page relevance, yielding fewer but higher-quality pages.
type: checkbox
unit: null
variable: jina_use_sitemap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: URL
max_length: 256
options: []
placeholder: https://docs.dify.ai/en/
required: true
tooltips: null
type: text-input
unit: null
variable: firecrawl_url
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: true
label: Crawl sub-pages
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_crawl_sub_pages
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: 10
label: Limit
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: null
variable: firecrawl_limit
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Max depth
max_length: 48
options: []
placeholder: ''
required: false
tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 just scrapes
the page of the entered url, depth 1 scrapes the url and everything after enteredURL
+ one /, and so on.
type: number
unit: null
variable: firecrawl_max_depth
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Exclude paths
max_length: 256
options: []
placeholder: blog/*, /about/*
required: false
tooltips: null
type: text-input
unit: null
variable: firecrawl_exclude_paths
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Include only paths
max_length: 256
options: []
placeholder: articles/*
required: false
tooltips: null
type: text-input
unit: null
variable: firecrawl_include_only_paths
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Extract only main content
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_extract_main_content
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 1024
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 50
label: Chunk overlap
max_length: 48
options: []
placeholder: null
required: false
tooltips: Setting the chunk overlap can maintain the semantic relevance between
them, enhancing the retrieve effect. It is recommended to set 10%-25% of the
maximum chunk size.
type: number
unit: characters
variable: chunk_overlap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View file

@ -0,0 +1,674 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: '#FFF4ED'
icon_type: emoji
name: website-crawl-general-high-quality
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: variable-aggregator
id: 1752491761974-source-1752565435219-target
source: '1752491761974'
sourceHandle: source
target: '1752565435219'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: datasource
targetType: variable-aggregator
id: 1752565402678-source-1752565435219-target
source: '1752565402678'
sourceHandle: source
target: '1752565435219'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: variable-aggregator
targetType: tool
id: 1752565435219-source-1752569675978-target
source: '1752565435219'
sourceHandle: source
target: '1752569675978'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752569675978-source-1752477924228-target
source: '1752569675978'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: text_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752569675978'
- result
indexing_technique: high_quality
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: semantic_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: false
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 2140.4053851189346
y: 281.3910724383104
positionAbsolute:
x: 2140.4053851189346
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Jina Reader
datasource_name: jina_reader
datasource_parameters:
crawl_sub_pages:
type: mixed
value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
limit:
type: variable
value:
- rag
- '1752491761974'
- jina_limit
url:
type: mixed
value: '{{#rag.1752491761974.jina_url#}}'
use_sitemap:
type: mixed
value: '{{#rag.1752491761974.jina_use_sitemap#}}'
plugin_id: langgenius/jina_datasource
provider_name: jina
provider_type: website_crawl
selected: false
title: Jina Reader
type: datasource
height: 52
id: '1752491761974'
position:
x: 1067.7526055798794
y: 281.3910724383104
positionAbsolute:
x: 1067.7526055798794
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Firecrawl
datasource_name: crawl
datasource_parameters:
crawl_subpages:
type: mixed
value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
exclude_paths:
type: mixed
value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
include_paths:
type: mixed
value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
limit:
type: variable
value:
- rag
- '1752565402678'
- firecrawl_limit
max_depth:
type: variable
value:
- rag
- '1752565402678'
- firecrawl_max_depth
only_main_content:
type: mixed
value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
url:
type: mixed
value: '{{#rag.1752565402678.firecrawl_url#}}'
plugin_id: langgenius/firecrawl_datasource
provider_name: firecrawl
provider_type: website_crawl
selected: false
title: Firecrawl
type: datasource
height: 52
id: '1752565402678'
position:
x: 1067.7526055798794
y: 417.32608398342404
positionAbsolute:
x: 1067.7526055798794
y: 417.32608398342404
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
output_type: string
selected: false
title: Variable Aggregator
type: variable-aggregator
variables:
- - '1752491761974'
- content
- - '1752565402678'
- content
height: 129
id: '1752565435219'
position:
x: 1505.4306671642219
y: 281.3910724383104
positionAbsolute:
x: 1505.4306671642219
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: The result of the general chunk tool.
properties:
general_chunks:
items:
description: The chunk of the text.
type: string
type: array
type: object
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input Variable
ja_JP: 入力変数
pt_BR: Variável de entrada
zh_Hans: 输入变量
llm_description: The text you want to chunk.
max: null
min: null
name: input_variable
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The delimiter of the chunks.
ja_JP: チャンクの区切り記号。
pt_BR: O delimitador dos pedaços.
zh_Hans: 块的分隔符。
label:
en_US: Delimiter
ja_JP: 区切り記号
pt_BR: Delimitador
zh_Hans: 分隔符
llm_description: The delimiter of the chunks, the format of the delimiter
must be a string.
max: null
min: null
name: delimiter
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: null
form: llm
human_description:
en_US: The maximum chunk length.
ja_JP: 最大長のチャンク。
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度。
label:
en_US: Maximum Chunk Length
ja_JP: チャンク最大長
pt_BR: O comprimento máximo do bloco
zh_Hans: 最大块的长度
llm_description: The maximum chunk length, the format of the chunk size
must be an integer.
max: null
min: null
name: max_chunk_length
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: The chunk overlap length.
ja_JP: チャンクの重複長。
pt_BR: The chunk overlap length.
zh_Hans: 块的重叠长度。
label:
en_US: Chunk Overlap Length
ja_JP: チャンク重複長
pt_BR: Chunk Overlap Length
zh_Hans: 块的重叠长度
llm_description: The chunk overlap length, the format of the chunk overlap
length must be an integer.
max: null
min: null
name: chunk_overlap_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: null
form: llm
human_description:
en_US: Replace consecutive spaces, newlines and tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace consecutive spaces, newlines and tabs
zh_Hans: 替换连续的空格、换行符和制表符
label:
en_US: Replace Consecutive Spaces, Newlines and Tabs
ja_JP: 連続のスペース、改行、またはタブを置換する
pt_BR: Replace Consecutive Spaces, Newlines and Tabs
zh_Hans: 替换连续的空格、换行符和制表符
llm_description: Replace consecutive spaces, newlines and tabs, the format
of the replace must be a boolean.
max: null
min: null
name: replace_consecutive_spaces_newlines_tabs
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: null
form: llm
human_description:
en_US: Delete all URLs and email addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete all URLs and email addresses
zh_Hans: 删除所有URL和电子邮件地址
label:
en_US: Delete All URLs and Email Addresses
ja_JP: すべてのURLとメールアドレスを削除する
pt_BR: Delete All URLs and Email Addresses
zh_Hans: 删除所有URL和电子邮件地址
llm_description: Delete all URLs and email addresses, the format of the
delete must be a boolean.
max: null
min: null
name: delete_all_urls_and_email_addresses
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
chunk_overlap_length: ''
delete_all_urls_and_email_addresses: ''
delimiter: ''
input_variable: ''
max_chunk_length: ''
replace_consecutive_spaces_newlines_tabs: ''
provider_id: langgenius/general_chunker/general_chunker
provider_name: langgenius/general_chunker/general_chunker
provider_type: builtin
selected: false
title: General Chunker
tool_configurations: {}
tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
tool_label: General Chunker
tool_name: general_chunker
tool_parameters:
chunk_overlap_length:
type: variable
value:
- rag
- shared
- chunk_overlap
delete_all_urls_and_email_addresses:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
delimiter:
type: mixed
value: '{{#rag.shared.delimiter#}}'
input_variable:
type: mixed
value: '{{#1752565435219.output#}}'
max_chunk_length:
type: variable
value:
- rag
- shared
- max_chunk_length
replace_consecutive_spaces_newlines_tabs:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
type: tool
height: 52
id: '1752569675978'
position:
x: 1807.4306671642219
y: 281.3910724383104
positionAbsolute:
x: 1807.4306671642219
y: 281.3910724383104
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: -707.721097109337
y: -93.07807382100896
zoom: 0.9350632198875476
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: URL
max_length: 256
options: []
placeholder: https://docs.dify.ai/en/
required: true
tooltips: null
type: text-input
unit: null
variable: jina_url
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: 10
label: Limit
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: null
variable: jina_limit
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: Crawl sub-pages
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: jina_crawl_sub_pages
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: Use sitemap
max_length: 48
options: []
placeholder: null
required: false
tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
iteratively based on page relevance, yielding fewer but higher-quality pages.
type: checkbox
unit: null
variable: jina_use_sitemap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: URL
max_length: 256
options: []
placeholder: https://docs.dify.ai/en/
required: true
tooltips: null
type: text-input
unit: null
variable: firecrawl_url
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: true
label: Crawl sub-pages
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_crawl_sub_pages
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: 10
label: Limit
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: null
variable: firecrawl_limit
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Max depth
max_length: 48
options: []
placeholder: ''
required: false
tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 scrapes only
the page at the entered URL, depth 1 scrapes that URL and every page one path
segment ("/") below it, and so on.
type: number
unit: null
variable: firecrawl_max_depth
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Exclude paths
max_length: 256
options: []
placeholder: blog/*, /about/*
required: false
tooltips: null
type: text-input
unit: null
variable: firecrawl_exclude_paths
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Include only paths
max_length: 256
options: []
placeholder: articles/*
required: false
tooltips: null
type: text-input
unit: null
variable: firecrawl_include_only_paths
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Extract only main content
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_extract_main_content
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into paragraph-sized chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 1024
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 50
label: Chunk overlap
max_length: 48
options: []
placeholder: null
required: false
tooltips: Setting the chunk overlap can maintain the semantic relevance between
chunks, enhancing the retrieval effect. It is recommended to set 10%-25% of the
maximum chunk size.
type: number
unit: characters
variable: chunk_overlap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View file

@ -0,0 +1,779 @@
dependencies:
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
- current_identifier: null
type: marketplace
value:
plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
kind: rag_pipeline
rag_pipeline:
description: ''
icon: 📙
icon_background: ''
icon_type: emoji
name: website-crawl-parentchild
version: 0.1.0
workflow:
conversation_variables: []
environment_variables: []
features: {}
graph:
edges:
- data:
isInLoop: false
sourceType: tool
targetType: knowledge-index
id: 1752490343805-source-1752477924228-target
source: '1752490343805'
sourceHandle: source
target: '1752477924228'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: datasource
targetType: variable-aggregator
id: 1752491761974-source-1752565435219-target
source: '1752491761974'
sourceHandle: source
target: '1752565435219'
targetHandle: target
type: custom
zIndex: 0
- data:
isInIteration: false
isInLoop: false
sourceType: variable-aggregator
targetType: tool
id: 1752565435219-source-1752490343805-target
source: '1752565435219'
sourceHandle: source
target: '1752490343805'
targetHandle: target
type: custom
zIndex: 0
- data:
isInLoop: false
sourceType: datasource
targetType: variable-aggregator
id: 1752565402678-source-1752565435219-target
source: '1752565402678'
sourceHandle: source
target: '1752565435219'
targetHandle: target
type: custom
zIndex: 0
nodes:
- data:
chunk_structure: hierarchical_model
embedding_model: text-embedding-ada-002
embedding_model_provider: langgenius/openai/openai
index_chunk_variable_selector:
- '1752490343805'
- result
indexing_technique: high_quality
keyword_number: 10
retrieval_model:
score_threshold: 0.5
score_threshold_enabled: false
search_method: semantic_search
top_k: 3
vector_setting:
embedding_model_name: text-embedding-ada-002
embedding_provider_name: langgenius/openai/openai
selected: false
title: Knowledge Base
type: knowledge-index
height: 114
id: '1752477924228'
position:
x: 2215.5544306817387
y: 281.3910724383104
positionAbsolute:
x: 2215.5544306817387
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
is_team_authorization: true
output_schema:
properties:
result:
description: Parent child chunks result
items:
type: object
type: array
type: object
paramSchemas:
- auto_generate: null
default: null
form: llm
human_description:
en_US: The text you want to chunk.
ja_JP: チャンク化したいテキスト。
pt_BR: O texto que você deseja dividir.
zh_Hans: 你想要分块的文本。
label:
en_US: Input text
ja_JP: 入力テキスト
pt_BR: Texto de entrada
zh_Hans: 输入文本
llm_description: The text you want to chunk.
max: null
min: null
name: input_text
options: []
placeholder: null
precision: null
required: true
scope: null
template: null
type: string
- auto_generate: null
default: 1024
form: llm
human_description:
en_US: Maximum length for chunking
ja_JP: チャンク分割の最大長
pt_BR: Comprimento máximo para divisão
zh_Hans: 用于分块的最大长度
label:
en_US: Maximum Length
ja_JP: 最大長
pt_BR: Comprimento Máximo
zh_Hans: 最大长度
llm_description: Maximum length allowed per chunk
max: null
min: null
name: max_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: '
'
form: llm
human_description:
en_US: Separator used for chunking
ja_JP: チャンク分割に使用する区切り文字
pt_BR: Separador usado para divisão
zh_Hans: 用于分块的分隔符
label:
en_US: Chunk Separator
ja_JP: チャンク区切り文字
pt_BR: Separador de Divisão
zh_Hans: 分块分隔符
llm_description: The separator used to split chunks
max: null
min: null
name: separator
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: string
- auto_generate: null
default: 512
form: llm
human_description:
en_US: Maximum length for subchunking
ja_JP: サブチャンク分割の最大長
pt_BR: Comprimento máximo para subdivisão
zh_Hans: 用于子分块的最大长度
label:
en_US: Subchunk Maximum Length
ja_JP: サブチャンク最大長
pt_BR: Comprimento Máximo de Subdivisão
zh_Hans: 子分块最大长度
llm_description: Maximum length allowed per subchunk
max: null
min: null
name: subchunk_max_length
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: number
- auto_generate: null
default: '. '
form: llm
human_description:
en_US: Separator used for subchunking
ja_JP: サブチャンク分割に使用する区切り文字
pt_BR: Separador usado para subdivisão
zh_Hans: 用于子分块的分隔符
label:
en_US: Subchunk Separator
ja_JP: サブチャンキング用セパレーター
pt_BR: Separador de Subdivisão
zh_Hans: 子分块分隔符
llm_description: The separator used to split subchunks
max: null
min: null
name: subchunk_separator
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: string
- auto_generate: null
default: paragraph
form: llm
human_description:
en_US: Split text into paragraphs based on separator and maximum chunk
length, using split text as parent block or entire document as parent
block and directly retrieve.
ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
máximo do bloco, usando o texto dividido como bloco pai ou documento
completo como bloco pai e diretamente recuperá-lo.
zh_Hans: 根据分隔符和最大块长度将文本拆分为段落,使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
label:
en_US: Parent Mode
ja_JP: 親子モード
pt_BR: Modo Pai
zh_Hans: 父块模式
llm_description: Split text into paragraphs based on separator and maximum
chunk length, using split text as parent block or entire document as parent
block and directly retrieve.
max: null
min: null
name: parent_mode
options:
- icon: ''
label:
en_US: Paragraph
ja_JP: 段落
pt_BR: Parágrafo
zh_Hans: 段落
value: paragraph
- icon: ''
label:
en_US: Full Document
ja_JP: 全文
pt_BR: Documento Completo
zh_Hans: 全文
value: full_doc
placeholder: null
precision: null
required: true
scope: null
template: null
type: select
- auto_generate: null
default: 0
form: llm
human_description:
en_US: Whether to remove extra spaces in the text
ja_JP: テキスト内の余分なスペースを削除するかどうか
pt_BR: Se deve remover espaços extras no texto
zh_Hans: 是否移除文本中的多余空格
label:
en_US: Remove Extra Spaces
ja_JP: 余分なスペースを削除
pt_BR: Remover Espaços Extras
zh_Hans: 移除多余空格
llm_description: Whether to remove extra spaces in the text
max: null
min: null
name: remove_extra_spaces
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
- auto_generate: null
default: 0
form: llm
human_description:
en_US: Whether to remove URLs and emails in the text
ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
pt_BR: Se deve remover URLs e e-mails no texto
zh_Hans: 是否移除文本中的URL和电子邮件地址
label:
en_US: Remove URLs and Emails
ja_JP: URLとメールアドレスを削除
pt_BR: Remover URLs e E-mails
zh_Hans: 移除URL和电子邮件地址
llm_description: Whether to remove URLs and emails in the text
max: null
min: null
name: remove_urls_emails
options: []
placeholder: null
precision: null
required: false
scope: null
template: null
type: boolean
params:
input_text: ''
max_length: ''
parent_mode: ''
remove_extra_spaces: ''
remove_urls_emails: ''
separator: ''
subchunk_max_length: ''
subchunk_separator: ''
provider_id: langgenius/parentchild_chunker/parentchild_chunker
provider_name: langgenius/parentchild_chunker/parentchild_chunker
provider_type: builtin
selected: true
title: Parent-child Chunker
tool_configurations: {}
tool_description: Parent-child Chunk Structure
tool_label: Parent-child Chunker
tool_name: parentchild_chunker
tool_parameters:
input_text:
type: mixed
value: '{{#1752565435219.output#}}'
max_length:
type: variable
value:
- rag
- shared
- max_chunk_length
parent_mode:
type: variable
value:
- rag
- shared
- parent_mode
remove_extra_spaces:
type: mixed
value: '{{#rag.shared.replace_consecutive_spaces#}}'
remove_urls_emails:
type: mixed
value: '{{#rag.shared.delete_urls_email#}}'
separator:
type: mixed
value: '{{#rag.shared.delimiter#}}'
subchunk_max_length:
type: variable
value:
- rag
- shared
- child_max_chunk_length
subchunk_separator:
type: mixed
value: '{{#rag.shared.child_delimiter#}}'
type: tool
height: 52
id: '1752490343805'
position:
x: 1853.5260563244174
y: 281.3910724383104
positionAbsolute:
x: 1853.5260563244174
y: 281.3910724383104
selected: true
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Jina Reader
datasource_name: jina_reader
datasource_parameters:
crawl_sub_pages:
type: mixed
value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
limit:
type: variable
value:
- rag
- '1752491761974'
- jina_limit
url:
type: mixed
value: '{{#rag.1752491761974.jina_url#}}'
use_sitemap:
type: mixed
value: '{{#rag.1752491761974.jina_use_sitemap#}}'
plugin_id: langgenius/jina_datasource
provider_name: jina
provider_type: website_crawl
selected: false
title: Jina Reader
type: datasource
height: 52
id: '1752491761974'
position:
x: 1067.7526055798794
y: 281.3910724383104
positionAbsolute:
x: 1067.7526055798794
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
datasource_configurations: {}
datasource_label: Firecrawl
datasource_name: crawl
datasource_parameters:
crawl_subpages:
type: mixed
value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
exclude_paths:
type: mixed
value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
include_paths:
type: mixed
value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
limit:
type: variable
value:
- rag
- '1752565402678'
- firecrawl_limit
max_depth:
type: variable
value:
- rag
- '1752565402678'
- firecrawl_max_depth
only_main_content:
type: mixed
value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
url:
type: mixed
value: '{{#rag.1752565402678.firecrawl_url#}}'
plugin_id: langgenius/firecrawl_datasource
provider_name: firecrawl
provider_type: website_crawl
selected: false
title: Firecrawl
type: datasource
height: 52
id: '1752565402678'
position:
x: 1067.7526055798794
y: 417.32608398342404
positionAbsolute:
x: 1067.7526055798794
y: 417.32608398342404
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
- data:
output_type: string
selected: false
title: Variable Aggregator
type: variable-aggregator
variables:
- - '1752491761974'
- content
- - '1752565402678'
- content
height: 129
id: '1752565435219'
position:
x: 1505.4306671642219
y: 281.3910724383104
positionAbsolute:
x: 1505.4306671642219
y: 281.3910724383104
selected: false
sourcePosition: right
targetPosition: left
type: custom
width: 242
viewport:
x: -826.1791044466438
y: -71.91725474841303
zoom: 0.9980166672552107
rag_pipeline_variables:
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: URL
max_length: 256
options: []
placeholder: https://docs.dify.ai/en/
required: true
tooltips: null
type: text-input
unit: null
variable: jina_url
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: 10
label: Limit
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: null
variable: jina_limit
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: Crawl sub-pages
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: jina_crawl_sub_pages
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752491761974'
default_value: null
label: Use sitemap
max_length: 48
options: []
placeholder: null
required: false
tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
iteratively based on page relevance, yielding fewer but higher-quality pages.
type: checkbox
unit: null
variable: jina_use_sitemap
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: URL
max_length: 256
options: []
placeholder: https://docs.dify.ai/en/
required: true
tooltips: null
type: text-input
unit: null
variable: firecrawl_url
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: true
label: Crawl sub-pages
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_crawl_sub_pages
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: 10
label: Limit
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: null
variable: firecrawl_limit
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Max depth
max_length: 48
options: []
placeholder: ''
required: false
tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 scrapes only
the page at the entered URL, depth 1 scrapes that URL and every page one path
segment ("/") below it, and so on.
type: number
unit: null
variable: firecrawl_max_depth
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Exclude paths
max_length: 256
options: []
placeholder: blog/*, /about/*
required: false
tooltips: null
type: text-input
unit: null
variable: firecrawl_exclude_paths
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Include only paths
max_length: 256
options: []
placeholder: articles/*
required: false
tooltips: null
type: text-input
unit: null
variable: firecrawl_include_only_paths
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: '1752565402678'
default_value: null
label: Extract only main content
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: firecrawl_extract_main_content
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n\n
label: Delimiter
max_length: 100
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n\n is recommended
for splitting the original document into large parent chunks. You can also use
special delimiters defined by yourself.
type: text-input
unit: null
variable: delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 1024
label: Maximum chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: \n
label: Child delimiter
max_length: 199
options: []
placeholder: null
required: true
tooltips: A delimiter is the character used to separate text. \n is recommended
for splitting parent chunks into small child chunks. You can also use special
delimiters defined by yourself.
type: text-input
unit: null
variable: child_delimiter
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: 512
label: Child max chunk length
max_length: 48
options: []
placeholder: null
required: true
tooltips: null
type: number
unit: characters
variable: child_max_chunk_length
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: paragraph
label: Parent mode
max_length: 48
options:
- full_doc
- paragraph
placeholder: null
required: true
tooltips: null
type: select
unit: null
variable: parent_mode
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Replace consecutive spaces, newlines and tabs
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: replace_consecutive_spaces
- allow_file_extension: null
allow_file_upload_methods: null
allowed_file_types: null
belong_to_node_id: shared
default_value: null
label: Delete all URLs and email addresses
max_length: 48
options: []
placeholder: null
required: false
tooltips: null
type: checkbox
unit: null
variable: delete_urls_email

View file

@ -1,6 +1,5 @@
import json
import logging
import re
from collections.abc import Mapping
from pathlib import Path
from typing import Any
@ -10,9 +9,9 @@ from sqlalchemy.orm import Session
from configs import dify_config
from constants import HIDDEN_VALUE, UNKNOWN_VALUE
from core.helper.name_generator import generate_incremental_name
from core.helper.position_helper import is_filtered
from core.helper.provider_cache import NoOpProviderCredentialCache, ToolProviderCredentialsCache
from core.plugin.entities.plugin import ToolProviderID
from core.tools.builtin_tool.provider import BuiltinToolProviderController
from core.tools.builtin_tool.providers._positions import BuiltinToolProviderSort
from core.tools.entities.api_entities import (
@ -30,6 +29,7 @@ from core.tools.utils.encryption import create_provider_encrypter
from core.tools.utils.system_oauth_encryption import decrypt_system_oauth_params
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.provider_ids import ToolProviderID
from models.tools import BuiltinToolProvider, ToolOAuthSystemClient, ToolOAuthTenantClient
from services.plugin.plugin_service import PluginService
from services.tools.tools_transform_service import ToolTransformService
@ -311,42 +311,20 @@ class BuiltinToolManageService:
def generate_builtin_tool_provider_name(
session: Session, tenant_id: str, provider: str, credential_type: CredentialType
) -> str:
try:
db_providers = (
session.query(BuiltinToolProvider)
.filter_by(
tenant_id=tenant_id,
provider=provider,
credential_type=credential_type.value,
)
.order_by(BuiltinToolProvider.created_at.desc())
.all()
db_providers = (
session.query(BuiltinToolProvider)
.filter_by(
tenant_id=tenant_id,
provider=provider,
credential_type=credential_type.value,
)
# Get the default name pattern
default_pattern = f"{credential_type.get_name()}"
# Find all names that match the default pattern: "{default_pattern} {number}"
pattern = rf"^{re.escape(default_pattern)}\s+(\d+)$"
numbers = []
for db_provider in db_providers:
if db_provider.name:
match = re.match(pattern, db_provider.name.strip())
if match:
numbers.append(int(match.group(1)))
# If no default pattern names found, start with 1
if not numbers:
return f"{default_pattern} 1"
# Find the next number
max_number = max(numbers)
return f"{default_pattern} {max_number + 1}"
except Exception as e:
logger.warning("Error generating next provider name for %s: %s", provider, str(e))
# fallback
return f"{credential_type.get_name()} 1"
.order_by(BuiltinToolProvider.created_at.desc())
.all()
)
return generate_incremental_name(
[provider.name for provider in db_providers],
f"{credential_type.get_name()}",
)
@staticmethod
def get_builtin_tool_provider_credentials(

View file

@ -1,12 +1,14 @@
import json
import logging
from typing import Any, Union, cast
from collections.abc import Mapping
from typing import Any, Union
from yarl import URL
from configs import dify_config
from core.helper.provider_cache import ToolProviderCredentialsCache
from core.mcp.types import Tool as MCPTool
from core.plugin.entities.plugin_daemon import PluginDatasourceProviderEntity
from core.tools.__base.tool import Tool
from core.tools.__base.tool_runtime import ToolRuntime
from core.tools.builtin_tool.provider import BuiltinToolProviderController
@ -38,7 +40,9 @@ class ToolTransformService:
return str(url_prefix % {"tenant_id": tenant_id, "filename": filename})
@classmethod
def get_tool_provider_icon_url(cls, provider_type: str, provider_name: str, icon: str | dict) -> Union[str, dict]:
def get_tool_provider_icon_url(
cls, provider_type: str, provider_name: str, icon: str | Mapping[str, str]
) -> str | Mapping[str, str]:
"""
get tool provider icon url
"""
@ -51,7 +55,7 @@ class ToolTransformService:
elif provider_type in {ToolProviderType.API.value, ToolProviderType.WORKFLOW.value}:
try:
if isinstance(icon, str):
return cast(dict, json.loads(icon))
return json.loads(icon)
return icon
except Exception:
return {"background": "#252525", "content": "\ud83d\ude01"}
@ -60,7 +64,7 @@ class ToolTransformService:
return ""
@staticmethod
def repack_provider(tenant_id: str, provider: Union[dict, ToolProviderApiEntity]):
def repack_provider(tenant_id: str, provider: Union[dict, ToolProviderApiEntity, PluginDatasourceProviderEntity]):
"""
repack provider
@ -89,6 +93,12 @@ class ToolTransformService:
provider.icon_dark = ToolTransformService.get_tool_provider_icon_url(
provider_type=provider.type.value, provider_name=provider.name, icon=provider.icon_dark
)
elif isinstance(provider, PluginDatasourceProviderEntity):
if provider.plugin_id:
if isinstance(provider.declaration.identity.icon, str):
provider.declaration.identity.icon = ToolTransformService.get_plugin_icon_url(
tenant_id=tenant_id, filename=provider.declaration.identity.icon
)
@classmethod
def builtin_provider_to_user_provider(
@ -106,7 +116,7 @@ class ToolTransformService:
name=provider_controller.entity.identity.name,
description=provider_controller.entity.identity.description,
icon=provider_controller.entity.identity.icon,
icon_dark=provider_controller.entity.identity.icon_dark,
icon_dark=provider_controller.entity.identity.icon_dark or "",
label=provider_controller.entity.identity.label,
type=ToolProviderType.BUILT_IN,
masked_credentials={},
@ -128,9 +138,10 @@ class ToolTransformService:
)
}
masked_creds = {}
for name in schema:
if result.masked_credentials:
result.masked_credentials[name] = ""
masked_creds[name] = ""
result.masked_credentials = masked_creds
# check if the provider need credentials
if not provider_controller.need_credentials:
@ -208,7 +219,7 @@ class ToolTransformService:
name=provider_controller.entity.identity.name,
description=provider_controller.entity.identity.description,
icon=provider_controller.entity.identity.icon,
icon_dark=provider_controller.entity.identity.icon_dark,
icon_dark=provider_controller.entity.identity.icon_dark or "",
label=provider_controller.entity.identity.label,
type=ToolProviderType.WORKFLOW,
masked_credentials={},
@ -321,7 +332,7 @@ class ToolTransformService:
@staticmethod
def convert_tool_entity_to_api_entity(
tool: Union[ApiToolBundle, WorkflowTool, Tool],
tool: ApiToolBundle | WorkflowTool | Tool,
tenant_id: str,
labels: list[str] | None = None,
) -> ToolApiEntity:
@ -375,7 +386,7 @@ class ToolTransformService:
parameters=merged_parameters,
labels=labels or [],
)
elif isinstance(tool, ApiToolBundle):
else:
return ToolApiEntity(
author=tool.author,
name=tool.operation_id or "",
@ -384,9 +395,6 @@ class ToolTransformService:
parameters=tool.parameters,
labels=labels or [],
)
else:
# Handle WorkflowTool case
raise ValueError(f"Unsupported tool type: {type(tool)}")
@staticmethod
def convert_builtin_provider_to_credential_entity(

View file

@ -0,0 +1,394 @@
import dataclasses
from collections.abc import Mapping
from typing import Any, Generic, TypeAlias, TypeVar, overload
from configs import dify_config
from core.file.models import File
from core.variables.segments import (
ArrayFileSegment,
ArraySegment,
BooleanSegment,
FileSegment,
FloatSegment,
IntegerSegment,
NoneSegment,
ObjectSegment,
Segment,
StringSegment,
)
from core.variables.utils import dumps_with_segments
_MAX_DEPTH = 100
class _QAKeys:
"""dict keys for _QAStructure"""
QA_CHUNKS = "qa_chunks"
QUESTION = "question"
ANSWER = "answer"
class _PCKeys:
"""dict keys for _ParentChildStructure"""
PARENT_MODE = "parent_mode"
PARENT_CHILD_CHUNKS = "parent_child_chunks"
PARENT_CONTENT = "parent_content"
CHILD_CONTENTS = "child_contents"
_T = TypeVar("_T")
@dataclasses.dataclass(frozen=True)
class _PartResult(Generic[_T]):
value: _T
value_size: int
truncated: bool
class MaxDepthExceededError(Exception):
    """Signals that a value is nested more deeply than the supported maximum depth."""
class UnknownTypeError(Exception):
    """Signals that a value has a type the size calculation does not know how to handle."""
# Union of the plain Python types that make up a JSON-compatible value.
JSONTypes: TypeAlias = int | float | str | list | dict | None | bool
@dataclasses.dataclass(frozen=True)
class TruncationResult:
    """Immutable public result of truncating a single segment."""

    # The segment to use in place of the input (possibly shortened).
    result: Segment
    # True when `result` no longer carries the complete original value.
    truncated: bool
class VariableTruncator:
"""
Handles variable truncation with structure-preserving strategies.
This class implements intelligent truncation that prioritizes maintaining data structure
integrity while ensuring the final size doesn't exceed specified limits.
Uses recursive size calculation to avoid repeated JSON serialization.
"""
def __init__(
self,
string_length_limit=5000,
array_element_limit: int = 20,
max_size_bytes: int = 1024_000, # 100KB
):
if string_length_limit <= 3:
raise ValueError("string_length_limit should be greater than 3.")
self._string_length_limit = string_length_limit
if array_element_limit <= 0:
raise ValueError("array_element_limit should be greater than 0.")
self._array_element_limit = array_element_limit
if max_size_bytes <= 0:
raise ValueError("max_size_bytes should be greater than 0.")
self._max_size_bytes = max_size_bytes
@classmethod
def default(cls) -> "VariableTruncator":
    """Build a truncator whose limits come from the `dify_config` workflow truncation settings."""
    configured_limits = {
        "max_size_bytes": dify_config.WORKFLOW_VARIABLE_TRUNCATION_MAX_SIZE,
        "array_element_limit": dify_config.WORKFLOW_VARIABLE_TRUNCATION_ARRAY_LENGTH,
        "string_length_limit": dify_config.WORKFLOW_VARIABLE_TRUNCATION_STRING_LENGTH,
    }
    return VariableTruncator(**configured_limits)
def truncate_variable_mapping(self, v: Mapping[str, Any]) -> tuple[Mapping[str, Any], bool]:
    """
    Truncate a variable mapping produced during workflow execution.

    Intended for mappings such as the `inputs`, `process_data`, or `outputs`
    of a WorkflowNodeExecution record. Every key of `v` is kept; values are
    shortened so the mapping stays within `max_size_bytes`.

    Args:
        v: Mapping from variable name to a Segment or a JSON-compatible value.

    Returns:
        A `(mapping, truncated)` tuple. `truncated` is True when any value was
        shortened or replaced by the "..." placeholder.
    """
    budget = self._max_size_bytes
    is_truncated = False
    truncated_mapping: dict[str, Any] = {}
    length = len(v)
    used_size = 0
    for key, value in v.items():
        used_size += self.calculate_json_size(key)
        if used_size > budget:
            # The budget is already exhausted: keep the key but drop its value.
            # The original value is lost here, so this must count as truncation.
            truncated_mapping[key] = "..."
            is_truncated = True
            continue
        # Share the remaining budget evenly between this entry and the ones
        # that have not been emitted yet.
        value_budget = (budget - used_size) // (length - len(truncated_mapping))
        if isinstance(value, Segment):
            part_result = self._truncate_segment(value, value_budget)
        else:
            part_result = self._truncate_json_primitives(value, value_budget)
        is_truncated = is_truncated or part_result.truncated
        truncated_mapping[key] = part_result.value
        used_size += part_result.value_size
    return truncated_mapping, is_truncated
@staticmethod
def _segment_need_truncation(segment: Segment) -> bool:
    """Return False for segment types that are always passed through untouched, True otherwise."""
    passthrough_types = (
        NoneSegment,
        FloatSegment,
        IntegerSegment,
        FileSegment,
        BooleanSegment,
        ArrayFileSegment,
    )
    return not isinstance(segment, passthrough_types)
@staticmethod
def _json_value_needs_truncation(value: Any) -> bool:
if value is None:
return False
if isinstance(value, (bool, int, float)):
return False
return True
def truncate(self, segment: Segment) -> TruncationResult:
    """
    Truncate a single segment.

    String segments are truncated against `string_length_limit`; every other
    segment is truncated against `max_size_bytes`. If the result is still
    larger than `max_size_bytes`, a string fallback is returned instead.

    Args:
        segment: The segment to truncate.

    Returns:
        A `TruncationResult` with the replacement segment and a flag telling
        whether anything was cut.
    """
    if isinstance(segment, StringSegment):
        result = self._truncate_segment(segment, self._string_length_limit)
    else:
        result = self._truncate_segment(segment, self._max_size_bytes)

    if result.value_size > self._max_size_bytes:
        truncated_segment = result.value
        # `_truncate_segment` always yields a Segment, never a bare `str`, so the
        # string case has to be detected on the segment type and unwrapped.
        if isinstance(truncated_segment, StringSegment):
            string_result = self._truncate_string(truncated_segment.value, self._max_size_bytes)
            return TruncationResult(StringSegment(value=string_result.value), True)

        # Apply final fallback - convert to JSON string and truncate
        json_str = dumps_with_segments(truncated_segment, ensure_ascii=False)
        if len(json_str) > self._max_size_bytes:
            json_str = json_str[: self._max_size_bytes] + "..."
        return TruncationResult(result=StringSegment(value=json_str), truncated=True)

    return TruncationResult(
        result=segment.model_copy(update={"value": result.value.value}), truncated=result.truncated
    )
def _truncate_segment(self, segment: Segment, target_size: int) -> _PartResult[Segment]:
    """
    Truncate one segment towards `target_size`.

    Args:
        segment: The segment to truncate.
        target_size: Size budget handed to the type-specific truncation routine.

    Returns:
        A `_PartResult` wrapping a segment (the input itself when its type is
        never truncated, otherwise a copy with the shortened value), the
        estimated size of the value, and the truncation flag.
    """
    if not VariableTruncator._segment_need_truncation(segment):
        return _PartResult(segment, self.calculate_json_size(segment.value), False)

    # Type-specific truncation routines, tried in this order.
    handlers = (
        (ArraySegment, self._truncate_array),
        (StringSegment, self._truncate_string),
        (ObjectSegment, self._truncate_object),
    )
    inner: _PartResult[Any]
    for segment_type, handler in handlers:
        if isinstance(segment, segment_type):
            inner = handler(segment.value, target_size)
            break
    else:
        raise AssertionError("this should be unreachable.")

    replacement = segment.model_copy(update={"value": inner.value})
    return _PartResult(
        value=replacement,
        value_size=inner.value_size,
        truncated=inner.truncated,
    )
@staticmethod
def calculate_json_size(value: Any, depth: int = 0) -> int:
    """
    Recursively estimate the JSON size of `value` without serializing it.

    Strings are measured in characters (plus 2 for the quotes), not in UTF-8
    encoded bytes, so the result is an approximation.

    Args:
        value: A Segment, a `File`, or a JSON-compatible Python value.
        depth: Current nesting depth; callers normally leave this at 0.

    Returns:
        The estimated size.

    Raises:
        MaxDepthExceededError: If nesting is deeper than `_MAX_DEPTH`.
        UnknownTypeError: If a value of an unsupported type is encountered.
    """
    if isinstance(value, Segment):
        # Unwrapping a segment does not add a nesting level, but the current
        # depth must be carried along so the depth guard keeps working.
        return VariableTruncator.calculate_json_size(value.value, depth=depth)
    if depth > _MAX_DEPTH:
        raise MaxDepthExceededError()
    if isinstance(value, str):
        # Ideally, the size of strings should be calculated based on their utf-8 encoded length.
        # However, this adds complexity as we would need to compute encoded sizes consistently
        # throughout the code. Therefore, we approximate the size using the string's length.
        # Rough estimate: number of characters, plus 2 for quotes
        return len(value) + 2
    elif isinstance(value, bool):
        # `bool` is a subclass of `int`, so it has to be tested before the numeric branch.
        return 4 if value else 5  # "true" or "false"
    elif isinstance(value, (int, float)):
        return len(str(value))
    elif value is None:
        return 4  # "null"
    elif isinstance(value, list):
        # Size = sum of elements + separators + brackets
        total = 2  # "[]"
        for i, item in enumerate(value):
            if i > 0:
                total += 1  # ","
            total += VariableTruncator.calculate_json_size(item, depth=depth + 1)
        return total
    elif isinstance(value, dict):
        # Size = sum of keys + values + separators + brackets
        total = 2  # "{}"
        for index, (key, item) in enumerate(value.items()):
            if index > 0:
                total += 1  # ","
            total += VariableTruncator.calculate_json_size(str(key), depth=depth + 1)  # Key as string
            total += 1  # ":"
            total += VariableTruncator.calculate_json_size(item, depth=depth + 1)
        return total
    elif isinstance(value, File):
        return VariableTruncator.calculate_json_size(value.model_dump(), depth=depth + 1)
    else:
        raise UnknownTypeError(f"got unknown type {type(value)}")
def _truncate_string(self, value: str, target_size: int) -> _PartResult[str]:
    """
    Shorten `value` so that its estimated JSON size fits `target_size`.

    A string whose estimated size is strictly below `target_size` is returned
    unchanged. Otherwise it is cut and suffixed with "...". The constant 5 is
    the estimated size of the bare marker: 3 dots plus 2 quotes.
    """
    original_size = self.calculate_json_size(value)
    if original_size < target_size:
        return _PartResult(value, original_size, False)

    if target_size < 5:
        # Not even the marker fits; return it on its own anyway.
        return _PartResult("...", 5, True)

    keep_chars = min(self._string_length_limit, target_size - 5)
    shortened = f"{value[:keep_chars]}..."
    return _PartResult(shortened, self.calculate_json_size(shortened), True)
def _truncate_array(self, value: list, target_size: int) -> _PartResult[list]:
    """
    Truncate a list.

    Strategy:
    1. Keep at most `array_element_limit` items.
    2. Truncate each kept item against the budget that is still available,
       and stop adding items once the budget is used up.

    Args:
        value: The list to truncate.
        target_size: Size budget for the whole list.

    Returns:
        A `_PartResult` with the new list, its estimated size, and a flag that
        is True when any item was dropped or shortened.
    """
    truncated_value: list[Any] = []
    truncated = False
    used_size = self.calculate_json_size([])

    target_length = self._array_element_limit

    for i, item in enumerate(value):
        if i >= target_length:
            return _PartResult(truncated_value, used_size, True)
        if i > 0:
            used_size += 1  # Account for comma

        if used_size > target_size:
            # Budget exhausted: this item and all following ones are dropped.
            truncated = True
            break

        part_result = self._truncate_json_primitives(item, target_size - used_size)
        truncated_value.append(part_result.value)
        used_size += part_result.value_size
        # Accumulate instead of overwrite, so an earlier truncated item is not
        # forgotten when a later item fits unchanged.
        truncated = truncated or part_result.truncated
    return _PartResult(truncated_value, used_size, truncated)
@classmethod
def _maybe_qa_structure(cls, m: Mapping[str, Any]) -> bool:
    """Return True when `m` has a `qa_chunks` entry whose value is a list."""
    # A missing key yields None, which is not a list either.
    return isinstance(m.get(_QAKeys.QA_CHUNKS), list)
@classmethod
def _maybe_parent_child_structure(cls, m: Mapping[str, Any]) -> bool:
    """Return True when `m` has a string `parent_mode` and a list `parent_child_chunks`."""
    # A missing key yields None, which fails both type checks.
    if not isinstance(m.get(_PCKeys.PARENT_MODE), str):
        return False
    return isinstance(m.get(_PCKeys.PARENT_CHILD_CHUNKS), list)
def _truncate_object(self, mapping: Mapping[str, Any], target_size: int) -> _PartResult[Mapping[str, Any]]:
    """
    Truncate a mapping while keeping as many of its keys as possible.

    Keys are visited in sorted order. Each value gets an equal share of the
    budget that is still unused. Keys themselves are never shortened. Once the
    budget runs out, the remaining keys (the tail of the sorted order) are
    dropped.

    Args:
        mapping: The mapping to truncate.
        target_size: Size budget for the whole object.

    Returns:
        A `_PartResult` with the new dict, its estimated size, and the
        truncation flag. An empty mapping is returned as is.
    """
    if not mapping:
        return _PartResult(mapping, self.calculate_json_size(mapping), False)

    kept = {}
    was_truncated = False
    consumed = self.calculate_json_size({})

    # Sorted iteration keeps the outcome deterministic.
    ordered_keys = sorted(mapping.keys())
    total_keys = len(ordered_keys)

    for position, key in enumerate(ordered_keys):
        if consumed > target_size:
            # Nothing left for further key-value pairs.
            was_truncated = True
            break

        # Fixed cost of this pair: optional leading comma, the key, and the ":".
        separator_size = 1 if position > 0 else 0
        entry_size = separator_size + self.calculate_json_size(key) + 1

        pairs_left = total_keys - position
        value_budget = max(0, (target_size - entry_size - consumed) // pairs_left)
        if value_budget <= 0:
            was_truncated = True
            break

        raw_value = mapping[key]
        if isinstance(raw_value, Segment):
            value_result = self._truncate_segment(raw_value, value_budget)
        else:
            value_result = self._truncate_json_primitives(raw_value, value_budget)

        kept[key] = value_result.value
        consumed += entry_size + value_result.value_size
        if value_result.truncated:
            was_truncated = True

    return _PartResult(kept, consumed, was_truncated)
@overload
def _truncate_json_primitives(self, val: str, target_size: int) -> _PartResult[str]: ...


@overload
def _truncate_json_primitives(self, val: list, target_size: int) -> _PartResult[list]: ...


@overload
def _truncate_json_primitives(self, val: dict, target_size: int) -> _PartResult[dict]: ...


@overload
def _truncate_json_primitives(self, val: bool, target_size: int) -> _PartResult[bool]: ...  # type: ignore


@overload
def _truncate_json_primitives(self, val: int, target_size: int) -> _PartResult[int]: ...


@overload
def _truncate_json_primitives(self, val: float, target_size: int) -> _PartResult[float]: ...


@overload
def _truncate_json_primitives(self, val: None, target_size: int) -> _PartResult[None]: ...


def _truncate_json_primitives(
    self, val: str | list | dict | bool | int | float | None, target_size: int
) -> _PartResult[Any]:
    """
    Truncate a plain JSON value towards `target_size`.

    Strings, lists and dicts are delegated to their dedicated routines.
    `None`, booleans and numbers are returned unchanged with their estimated
    size. Any other type is a programming error.
    """
    if isinstance(val, str):
        return self._truncate_string(val, target_size)
    if isinstance(val, list):
        return self._truncate_array(val, target_size)
    if isinstance(val, dict):
        return self._truncate_object(val, target_size)
    if val is None or isinstance(val, (bool, int, float)):
        # Scalars are never shortened.
        return _PartResult(val, self.calculate_json_size(val), False)
    raise AssertionError("this statement should be unreachable.")

View file

@ -11,7 +11,7 @@ from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
from core.rag.extractor.watercrawl.provider import WaterCrawlProvider
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from services.auth.api_key_auth_service import ApiKeyAuthService
from services.datasource_provider_service import DatasourceProviderService
@dataclass
@ -103,7 +103,6 @@ class WebsiteCrawlStatusApiRequest:
def from_args(cls, args: dict, job_id: str) -> "WebsiteCrawlStatusApiRequest":
"""Create from Flask-RESTful parsed arguments."""
provider = args.get("provider")
if not provider:
raise ValueError("Provider is required")
if not job_id:
@ -116,12 +115,28 @@ class WebsiteService:
"""Service class for website crawling operations using different providers."""
@classmethod
def _get_credentials_and_config(cls, tenant_id: str, provider: str) -> tuple[dict, dict]:
def _get_credentials_and_config(cls, tenant_id: str, provider: str) -> tuple[Any, Any]:
"""Get and validate credentials for a provider."""
credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
if not credentials or "config" not in credentials:
raise ValueError("No valid credentials found for the provider")
return credentials, credentials["config"]
if provider == "firecrawl":
plugin_id = "langgenius/firecrawl_datasource"
elif provider == "watercrawl":
plugin_id = "langgenius/watercrawl_datasource"
elif provider == "jinareader":
plugin_id = "langgenius/jina_datasource"
else:
raise ValueError("Invalid provider")
datasource_provider_service = DatasourceProviderService()
credential = datasource_provider_service.get_datasource_credentials(
tenant_id=tenant_id,
provider=provider,
plugin_id=plugin_id,
)
if provider == "firecrawl":
return credential.get("firecrawl_api_key"), credential
elif provider in {"watercrawl", "jinareader"}:
return credential.get("api_key"), credential
else:
raise ValueError("Invalid provider")
@classmethod
def _get_decrypted_api_key(cls, tenant_id: str, config: dict) -> str:
@ -144,8 +159,7 @@ class WebsiteService:
"""Crawl a URL using the specified provider with typed request."""
request = api_request.to_crawl_request()
_, config = cls._get_credentials_and_config(current_user.current_tenant_id, request.provider)
api_key = cls._get_decrypted_api_key(current_user.current_tenant_id, config)
api_key, config = cls._get_credentials_and_config(current_user.current_tenant_id, request.provider)
if request.provider == "firecrawl":
return cls._crawl_with_firecrawl(request=request, api_key=api_key, config=config)
@ -207,7 +221,7 @@ class WebsiteService:
headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
)
if response.json().get("code") != 200:
raise ValueError("Failed to crawl")
raise ValueError("Failed to crawl:")
return {"status": "active", "data": response.json().get("data")}
else:
response = requests.post(
@ -235,8 +249,7 @@ class WebsiteService:
@classmethod
def get_crawl_status_typed(cls, api_request: WebsiteCrawlStatusApiRequest) -> dict[str, Any]:
"""Get crawl status using typed request."""
_, config = cls._get_credentials_and_config(current_user.current_tenant_id, api_request.provider)
api_key = cls._get_decrypted_api_key(current_user.current_tenant_id, config)
api_key, config = cls._get_credentials_and_config(current_user.current_tenant_id, api_request.provider)
if api_request.provider == "firecrawl":
return cls._get_firecrawl_status(api_request.job_id, api_key, config)
@ -310,8 +323,7 @@ class WebsiteService:
@classmethod
def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict[str, Any] | None:
_, config = cls._get_credentials_and_config(tenant_id, provider)
api_key = cls._get_decrypted_api_key(tenant_id, config)
api_key, config = cls._get_credentials_and_config(tenant_id, provider)
if provider == "firecrawl":
return cls._get_firecrawl_url_data(job_id, url, api_key, config)
@ -384,8 +396,7 @@ class WebsiteService:
def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict[str, Any]:
request = ScrapeRequest(provider=provider, url=url, tenant_id=tenant_id, only_main_content=only_main_content)
_, config = cls._get_credentials_and_config(tenant_id=request.tenant_id, provider=request.provider)
api_key = cls._get_decrypted_api_key(tenant_id=request.tenant_id, config=config)
api_key, config = cls._get_credentials_and_config(tenant_id=request.tenant_id, provider=request.provider)
if request.provider == "firecrawl":
return cls._scrape_with_firecrawl(request=request, api_key=api_key, config=config)

View file

@ -146,7 +146,7 @@ class WorkflowConverter:
graph=graph,
model_config=app_config.model,
prompt_template=app_config.prompt_template,
file_upload=app_config.additional_features.file_upload,
file_upload=app_config.additional_features.file_upload if app_config.additional_features else None,
external_data_variable_node_mapping=external_data_variable_node_mapping,
)

View file

@ -4,7 +4,7 @@ from datetime import datetime
from sqlalchemy import and_, func, or_, select
from sqlalchemy.orm import Session
from core.workflow.entities.workflow_execution import WorkflowExecutionStatus
from core.workflow.enums import WorkflowExecutionStatus
from models import Account, App, EndUser, WorkflowAppLog, WorkflowRun
from models.enums import CreatorUserRole

View file

@ -1,32 +1,44 @@
import dataclasses
import json
import logging
from collections.abc import Mapping, Sequence
from concurrent.futures import ThreadPoolExecutor
from enum import StrEnum
from typing import Any, ClassVar
from sqlalchemy import Engine, orm
from sqlalchemy import Engine, orm, select
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.sql.expression import and_, or_
from configs import dify_config
from core.app.entities.app_invoke_entities import InvokeFrom
from core.file.models import File
from core.variables import Segment, StringSegment, Variable
from core.variables.consts import SELECTORS_LENGTH
from core.variables.segments import ArrayFileSegment, FileSegment
from core.variables.segments import (
ArrayFileSegment,
FileSegment,
)
from core.variables.types import SegmentType
from core.variables.utils import dumps_with_segments
from core.workflow.constants import CONVERSATION_VARIABLE_NODE_ID, ENVIRONMENT_VARIABLE_NODE_ID, SYSTEM_VARIABLE_NODE_ID
from core.workflow.enums import SystemVariableKey
from core.workflow.nodes import NodeType
from core.workflow.nodes.variable_assigner.common.helpers import get_updated_variables
from core.workflow.variable_loader import VariableLoader
from extensions.ext_storage import storage
from factories.file_factory import StorageKeyLoader
from factories.variable_factory import build_segment, segment_to_variable
from libs.datetime_utils import naive_utc_now
from libs.uuid_utils import uuidv7
from models import App, Conversation
from models.account import Account
from models.enums import DraftVariableType
from models.workflow import Workflow, WorkflowDraftVariable, is_system_variable_editable
from models.workflow import Workflow, WorkflowDraftVariable, WorkflowDraftVariableFile, is_system_variable_editable
from repositories.factory import DifyAPIRepositoryFactory
from services.file_service import FileService
from services.variable_truncator import VariableTruncator
logger = logging.getLogger(__name__)
@ -37,6 +49,12 @@ class WorkflowDraftVariableList:
total: int | None = None
@dataclasses.dataclass(frozen=True)
class DraftVarFileDeletion:
    """Immutable pair of identifiers: a draft variable and its associated draft variable file."""

    # Identifier of the draft variable.
    draft_var_id: str
    # Identifier of the draft variable file.
    draft_var_file_id: str
class WorkflowDraftVariableError(Exception):
pass
@ -87,7 +105,26 @@ class DraftVarLoader(VariableLoader):
srv = WorkflowDraftVariableService(session)
draft_vars = srv.get_draft_variables_by_selectors(self._app_id, selectors)
# Important:
files: list[File] = []
# FileSegment and ArrayFileSegment are not subject to offloading, so their values
# can be safely accessed before any offloading logic is applied.
for draft_var in draft_vars:
value = draft_var.get_value()
if isinstance(value, FileSegment):
files.append(value.value)
elif isinstance(value, ArrayFileSegment):
files.extend(value.value)
with Session(bind=self._engine) as session:
storage_key_loader = StorageKeyLoader(session, tenant_id=self._tenant_id)
storage_key_loader.load_storage_keys(files)
offloaded_draft_vars = []
for draft_var in draft_vars:
if draft_var.is_truncated():
offloaded_draft_vars.append(draft_var)
continue
segment = draft_var.get_value()
variable = segment_to_variable(
segment=segment,
@ -99,20 +136,51 @@ class DraftVarLoader(VariableLoader):
selector_tuple = self._selector_to_tuple(variable.selector)
variable_by_selector[selector_tuple] = variable
# Important:
files: list[File] = []
for draft_var in draft_vars:
value = draft_var.get_value()
if isinstance(value, FileSegment):
files.append(value.value)
elif isinstance(value, ArrayFileSegment):
files.extend(value.value)
with Session(bind=self._engine) as session:
storage_key_loader = StorageKeyLoader(session, tenant_id=self._tenant_id)
storage_key_loader.load_storage_keys(files)
# Load offloaded variables using multithreading.
# This approach reduces loading time by querying external systems concurrently.
with ThreadPoolExecutor(max_workers=10) as executor:
offloaded_variables = executor.map(self._load_offloaded_variable, offloaded_draft_vars)
for selector, variable in offloaded_variables:
variable_by_selector[selector] = variable
return list(variable_by_selector.values())
def _load_offloaded_variable(self, draft_var: WorkflowDraftVariable) -> tuple[tuple[str, str], Variable]:
    """
    Load the full value of an offloaded draft variable from storage.

    The content stored under the upload file's key is turned back into a
    segment: decoded as text when the recorded value type is STRING, parsed as
    JSON otherwise.

    Returns:
        A `((node_id, name), variable)` tuple for the given draft variable.
    """
    # This logic is closely tied to `WorkflowDraftVariableService._try_offload_large_variable`
    # and must remain synchronized with it.
    # Ideally, these should be co-located for better maintainability.
    # However, due to the current code structure, this is not straightforward.
    variable_file = draft_var.variable_file
    assert variable_file is not None
    upload_file = variable_file.upload_file
    assert upload_file is not None

    content = storage.load(upload_file.key)

    # Annotated as the base type: a bare assignment would be inferred as
    # StringSegment, which is too narrow for the JSON branch.
    segment: Segment
    if variable_file.value_type == SegmentType.STRING:
        segment = StringSegment(value=content.decode())
    else:
        # No special handling needed for ArrayFileSegment, as we do not offload ArrayFileSegment
        deserialized = json.loads(content)
        segment = WorkflowDraftVariable.build_segment_with_type(variable_file.value_type, deserialized)

    variable = segment_to_variable(
        segment=segment,
        selector=draft_var.get_selector(),
        id=draft_var.id,
        name=draft_var.name,
        description=draft_var.description,
    )
    selector_key = (draft_var.node_id, draft_var.name)
    return selector_key, variable
class WorkflowDraftVariableService:
_session: Session
@ -138,13 +206,24 @@ class WorkflowDraftVariableService:
)
def get_variable(self, variable_id: str) -> WorkflowDraftVariable | None:
return self._session.query(WorkflowDraftVariable).where(WorkflowDraftVariable.id == variable_id).first()
return (
self._session.query(WorkflowDraftVariable)
.options(orm.selectinload(WorkflowDraftVariable.variable_file))
.where(WorkflowDraftVariable.id == variable_id)
.first()
)
def get_draft_variables_by_selectors(
self,
app_id: str,
selectors: Sequence[list[str]],
) -> list[WorkflowDraftVariable]:
"""
Retrieve WorkflowDraftVariable instances based on app_id and selectors.
The returned WorkflowDraftVariable objects are guaranteed to have their
associated variable_file and variable_file.upload_file relationships preloaded.
"""
ors = []
for selector in selectors:
assert len(selector) >= SELECTORS_LENGTH, f"Invalid selector to get: {selector}"
@ -159,7 +238,14 @@ class WorkflowDraftVariableService:
# combined using `UNION` to fetch all rows.
# Benchmarking indicates that both approaches yield comparable performance.
variables = (
self._session.query(WorkflowDraftVariable).where(WorkflowDraftVariable.app_id == app_id, or_(*ors)).all()
self._session.query(WorkflowDraftVariable)
.options(
orm.selectinload(WorkflowDraftVariable.variable_file).selectinload(
WorkflowDraftVariableFile.upload_file
)
)
.where(WorkflowDraftVariable.app_id == app_id, or_(*ors))
.all()
)
return variables
@ -170,8 +256,10 @@ class WorkflowDraftVariableService:
if page == 1:
total = query.count()
variables = (
# Do not load the `value` field.
query.options(orm.defer(WorkflowDraftVariable.value))
# Do not load the `value` field
query.options(
orm.defer(WorkflowDraftVariable.value, raiseload=True),
)
.order_by(WorkflowDraftVariable.created_at.desc())
.limit(limit)
.offset((page - 1) * limit)
@ -186,7 +274,11 @@ class WorkflowDraftVariableService:
WorkflowDraftVariable.node_id == node_id,
)
query = self._session.query(WorkflowDraftVariable).where(*criteria)
variables = query.order_by(WorkflowDraftVariable.created_at.desc()).all()
variables = (
query.options(orm.selectinload(WorkflowDraftVariable.variable_file))
.order_by(WorkflowDraftVariable.created_at.desc())
.all()
)
return WorkflowDraftVariableList(variables=variables)
def list_node_variables(self, app_id: str, node_id: str) -> WorkflowDraftVariableList:
@ -210,6 +302,7 @@ class WorkflowDraftVariableService:
def _get_variable(self, app_id: str, node_id: str, name: str) -> WorkflowDraftVariable | None:
variable = (
self._session.query(WorkflowDraftVariable)
.options(orm.selectinload(WorkflowDraftVariable.variable_file))
.where(
WorkflowDraftVariable.app_id == app_id,
WorkflowDraftVariable.node_id == node_id,
@ -278,7 +371,7 @@ class WorkflowDraftVariableService:
self._session.flush()
return None
outputs_dict = node_exec.outputs_dict or {}
outputs_dict = node_exec.load_full_outputs(self._session, storage) or {}
# a sentinel value used to check the absence of the output variable key.
absent = object()
@ -323,6 +416,49 @@ class WorkflowDraftVariableService:
return self._reset_node_var_or_sys_var(workflow, variable)
def delete_variable(self, variable: WorkflowDraftVariable):
    """
    Delete a draft variable together with the offloaded content it references.

    When `variable.is_truncated()` is false the variable row is simply marked for deletion.
    Otherwise the variable is reloaded with its `variable_file` and `variable_file.upload_file`
    relationships eagerly loaded, so that the stored object, the `UploadFile` row and the
    `WorkflowDraftVariableFile` row can all be removed along with the variable.

    Missing associations are logged as warnings; whatever rows are still reachable are deleted.

    Row deletions are only registered on `self._session` (no commit is issued here), while
    `storage.delete` is invoked immediately.

    :param variable: the draft variable to delete
    """
    if not variable.is_truncated():
        self._session.delete(variable)
        return

    variable_query = (
        select(WorkflowDraftVariable)
        .options(
            orm.selectinload(WorkflowDraftVariable.variable_file).selectinload(
                WorkflowDraftVariableFile.upload_file
            ),
        )
        .where(WorkflowDraftVariable.id == variable.id)
    )
    variable_reloaded = self._session.execute(variable_query).scalars().first()
    if variable_reloaded is None:
        logger.warning("Associated WorkflowDraftVariable not found, draft_var_id=%s", variable.id)
        self._session.delete(variable)
        return

    variable_file = variable_reloaded.variable_file
    if variable_file is None:
        logger.warning(
            "Associated WorkflowDraftVariableFile not found, draft_var_id=%s, file_id=%s",
            variable_reloaded.id,
            variable_reloaded.file_id,
        )
        self._session.delete(variable)
        return

    upload_file = variable_file.upload_file
    if upload_file is None:
        logger.warning(
            "Associated UploadFile not found, draft_var_id=%s, file_id=%s, upload_file_id=%s",
            variable_reloaded.id,
            variable_reloaded.file_id,
            variable_file.upload_file_id,
        )
        self._session.delete(variable)
        self._session.delete(variable_file)
        return

    storage.delete(upload_file.key)
    self._session.delete(upload_file)
    # The variable-file row must be removed as well; previously `upload_file` was deleted twice
    # here and the `WorkflowDraftVariableFile` row was never deleted on this path.
    self._session.delete(variable_file)
    self._session.delete(variable)
def delete_workflow_variables(self, app_id: str):
@ -332,6 +468,38 @@ class WorkflowDraftVariableService:
.delete(synchronize_session=False)
)
def delete_workflow_draft_variable_file(self, deletions: list[DraftVarFileDeletion]):
    """
    Remove the offloaded files referenced by the given deletion requests.

    All requested `WorkflowDraftVariableFile` rows are fetched in a single query with their
    `upload_file` relationship preloaded. For each request:

    - if the variable-file row is not found, a warning is logged and the request is skipped;
    - if its upload file is missing, a warning is logged and only the variable-file row is deleted;
    - otherwise the stored object, the upload-file row and the variable-file row are deleted.

    :param deletions: requests carrying `draft_var_id` and `draft_var_file_id`
    """
    requested_ids = [deletion.draft_var_file_id for deletion in deletions]
    stmt = (
        select(WorkflowDraftVariableFile)
        .options(orm.selectinload(WorkflowDraftVariableFile.upload_file))
        .where(WorkflowDraftVariableFile.id.in_(requested_ids))
    )
    found_by_id = {row.id: row for row in self._session.execute(stmt).scalars().all()}

    for deletion in deletions:
        variable_file = found_by_id.get(deletion.draft_var_file_id)
        if variable_file is None:
            logger.warning(
                "Associated WorkflowDraftVariableFile not found, draft_var_id=%s, file_id=%s",
                deletion.draft_var_id,
                deletion.draft_var_file_id,
            )
            continue

        upload_file = variable_file.upload_file
        if upload_file is None:
            logger.warning(
                "Associated UploadFile not found, draft_var_id=%s, file_id=%s, upload_file_id=%s",
                deletion.draft_var_id,
                deletion.draft_var_file_id,
                variable_file.upload_file_id,
            )
        else:
            storage.delete(upload_file.key)
            self._session.delete(upload_file)
        # The variable-file row is removed whether or not its upload file still exists.
        self._session.delete(variable_file)
def delete_node_variables(self, app_id: str, node_id: str):
    """
    Public wrapper around `_delete_node_variables`.

    Forwards `app_id` and `node_id` unchanged and returns whatever the private helper returns.
    """
    return self._delete_node_variables(app_id, node_id)
@ -476,6 +644,7 @@ def _batch_upsert_draft_variable(
"visible": stmt.excluded.visible,
"editable": stmt.excluded.editable,
"node_execution_id": stmt.excluded.node_execution_id,
"file_id": stmt.excluded.file_id,
},
)
elif policy == _UpsertPolicy.IGNORE:
@ -495,6 +664,7 @@ def _model_to_insertion_dict(model: WorkflowDraftVariable) -> dict[str, Any]:
"value_type": model.value_type,
"value": model.value,
"node_execution_id": model.node_execution_id,
"file_id": model.file_id,
}
if model.visible is not None:
d["visible"] = model.visible
@ -524,6 +694,28 @@ def _build_segment_for_serialized_values(v: Any) -> Segment:
return build_segment(WorkflowDraftVariable.rebuild_file_types(v))
def _make_filename_trans_table() -> dict[int, str]:
linux_chars = ["/", "\x00"]
windows_chars = [
"<",
">",
":",
'"',
"/",
"\\",
"|",
"?",
"*",
]
windows_chars.extend(chr(i) for i in range(32))
trans_table = dict.fromkeys(linux_chars + windows_chars, "_")
return str.maketrans(trans_table)
_FILENAME_TRANS_TABLE = _make_filename_trans_table()
class DraftVariableSaver:
# _DUMMY_OUTPUT_IDENTITY is a placeholder output for workflow nodes.
# Its sole possible value is `None`.
@ -573,6 +765,7 @@ class DraftVariableSaver:
node_id: str,
node_type: NodeType,
node_execution_id: str,
user: Account,
enclosing_node_id: str | None = None,
):
# Important: `node_execution_id` parameter refers to the primary key (`id`) of the
@ -583,6 +776,7 @@ class DraftVariableSaver:
self._node_id = node_id
self._node_type = node_type
self._node_execution_id = node_execution_id
self._user = user
self._enclosing_node_id = enclosing_node_id
def _create_dummy_output_variable(self):
@ -692,17 +886,133 @@ class DraftVariableSaver:
else:
value_seg = _build_segment_for_serialized_values(value)
draft_vars.append(
WorkflowDraftVariable.new_node_variable(
app_id=self._app_id,
node_id=self._node_id,
self._create_draft_variable(
name=name,
node_execution_id=self._node_execution_id,
value=value_seg,
visible=self._should_variable_be_visible(self._node_id, self._node_type, name),
)
visible=True,
editable=True,
),
# WorkflowDraftVariable.new_node_variable(
# app_id=self._app_id,
# node_id=self._node_id,
# name=name,
# node_execution_id=self._node_execution_id,
# value=value_seg,
# visible=self._should_variable_be_visible(self._node_id, self._node_type, name),
# )
)
return draft_vars
def _generate_filename(self, name: str):
    """
    Return the filename stem "<node_id>-<name>" for an offloaded variable.

    Both the node id and the variable name are passed through `_FILENAME_TRANS_TABLE`, so no
    character covered by that table can end up in the stem. Previously only the node id was
    sanitized and `name` was appended verbatim.

    :param name: the variable name
    :return: the sanitized stem, without a file extension
    """
    # "-" is not part of the translation table, so translating the joined string is equivalent
    # to translating each component separately.
    return f"{self._node_id}-{name}".translate(_FILENAME_TRANS_TABLE)
def _try_offload_large_variable(
    self,
    name: str,
    value_seg: Segment,
) -> tuple[Segment, WorkflowDraftVariableFile] | None:
    """
    Offload the full value of a variable to file storage when it had to be truncated.

    The value is run through a `VariableTruncator` configured from `dify_config`. If the
    truncator reports no truncation, nothing is stored and `None` is returned. Otherwise the
    original value is serialized (plain text for `StringSegment`, JSON via
    `dumps_with_segments` for every other segment type), uploaded through `FileService`, and a
    `WorkflowDraftVariableFile` row referencing the upload is committed in a separate session
    bound to the same engine.

    :param name: the variable name, used to derive the uploaded file's name
    :param value_seg: the full, untruncated value
    :return: `(truncated_segment, variable_file)` when the value was offloaded, otherwise `None`
    """
    # This logic is closely tied to `DraftVarLoader._load_offloaded_variable` and must remain
    # synchronized with it.
    # Ideally, these should be co-located for better maintainability.
    # However, due to the current code structure, this is not straightforward.
    truncator = VariableTruncator(
        max_size_bytes=dify_config.WORKFLOW_VARIABLE_TRUNCATION_MAX_SIZE,
        array_element_limit=dify_config.WORKFLOW_VARIABLE_TRUNCATION_ARRAY_LENGTH,
        string_length_limit=dify_config.WORKFLOW_VARIABLE_TRUNCATION_STRING_LENGTH,
    )
    truncation_result = truncator.truncate(value_seg)
    if not truncation_result.truncated:
        return None

    # Element count is only recorded for list / dict values.
    original_length = None
    if isinstance(value_seg.value, (list, dict)):
        original_length = len(value_seg.value)

    # Prepare content for storage
    if isinstance(value_seg, StringSegment):
        # For string types, store as plain text
        original_content_serialized = value_seg.value
        content_type = "text/plain"
        filename = f"{self._generate_filename(name)}.txt"
    else:
        # For other types, store as JSON
        original_content_serialized = dumps_with_segments(value_seg.value, ensure_ascii=False)
        content_type = "application/json"
        filename = f"{self._generate_filename(name)}.json"

    # Encode once: the same bytes are used to measure the size and as the uploaded payload.
    content_bytes = original_content_serialized.encode("utf-8")

    # Resolve the engine once and reuse it for both the upload and the record insertion.
    engine = self._session.get_bind()
    assert isinstance(engine, Engine)

    upload_file = FileService(engine).upload_file(
        filename=filename,
        content=content_bytes,
        mimetype=content_type,
        user=self._user,
    )

    # Create WorkflowDraftVariableFile record
    variable_file = WorkflowDraftVariableFile(
        id=uuidv7(),
        upload_file_id=upload_file.id,
        size=len(content_bytes),
        length=original_length,
        value_type=value_seg.value_type,
        app_id=self._app_id,
        tenant_id=self._user.current_tenant_id,
        user_id=self._user.id,
    )
    # The record is committed in its own session; `expire_on_commit=False` keeps the attributes
    # of the returned object loaded after that session is closed.
    with Session(bind=engine, expire_on_commit=False) as session:
        session.add(variable_file)
        session.commit()

    return truncation_result.result, variable_file
def _create_draft_variable(
    self,
    *,
    name: str,
    value: Segment,
    visible: bool = True,
    editable: bool = True,
) -> WorkflowDraftVariable:
    """
    Build a node draft variable, offloading the value first when it is too large.

    If `_try_offload_large_variable` returns a result, the variable stores the truncated
    value, is forced to be non-editable and references the offloaded file through `file_id`.
    Otherwise the variable stores `value` as-is and honours the `editable` argument.
    """
    offloaded = self._try_offload_large_variable(name, value)

    # Arguments shared by both outcomes.
    shared_kwargs = {
        "app_id": self._app_id,
        "node_id": self._node_id,
        "name": name,
        "node_execution_id": self._node_execution_id,
        "visible": visible,
    }

    if offloaded is not None:
        truncated_value, variable_file = offloaded
        return WorkflowDraftVariable.new_node_variable(
            **shared_kwargs,
            value=truncated_value,
            editable=False,
            file_id=variable_file.id,
        )

    return WorkflowDraftVariable.new_node_variable(
        **shared_kwargs,
        value=value,
        editable=editable,
    )
def save(
self,
process_data: Mapping[str, Any] | None = None,

View file

@ -3,7 +3,6 @@ import time
import uuid
from collections.abc import Callable, Generator, Mapping, Sequence
from typing import Any, cast
from uuid import uuid4
from sqlalchemy import exists, select
from sqlalchemy.orm import Session, sessionmaker
@ -15,22 +14,20 @@ from core.file import File
from core.repositories import DifyCoreRepositoryFactory
from core.variables import Variable
from core.variables.variables import VariableUnion
from core.workflow.entities.node_entities import NodeRunResult
from core.workflow.entities.variable_pool import VariablePool
from core.workflow.entities.workflow_node_execution import WorkflowNodeExecution, WorkflowNodeExecutionStatus
from core.workflow.entities import VariablePool, WorkflowNodeExecution
from core.workflow.enums import ErrorStrategy, WorkflowNodeExecutionMetadataKey, WorkflowNodeExecutionStatus
from core.workflow.errors import WorkflowNodeRunFailedError
from core.workflow.graph_engine.entities.event import InNodeEvent
from core.workflow.graph_events import GraphNodeEventBase, NodeRunFailedEvent, NodeRunSucceededEvent
from core.workflow.node_events import NodeRunResult
from core.workflow.nodes import NodeType
from core.workflow.nodes.base.node import BaseNode
from core.workflow.nodes.enums import ErrorStrategy
from core.workflow.nodes.event import RunCompletedEvent
from core.workflow.nodes.event.types import NodeEvent
from core.workflow.nodes.base.node import Node
from core.workflow.nodes.node_mapping import LATEST_VERSION, NODE_TYPE_CLASSES_MAPPING
from core.workflow.nodes.start.entities import StartNodeData
from core.workflow.system_variable import SystemVariable
from core.workflow.workflow_entry import WorkflowEntry
from events.app_event import app_draft_workflow_was_synced, app_published_workflow_was_updated
from extensions.ext_database import db
from extensions.ext_storage import storage
from factories.file_factory import build_from_mapping, build_from_mappings
from libs.datetime_utils import naive_utc_now
from models.account import Account
@ -276,12 +273,13 @@ class WorkflowService:
type=draft_workflow.type,
version=Workflow.version_from_datetime(naive_utc_now()),
graph=draft_workflow.graph,
features=draft_workflow.features,
created_by=account.id,
environment_variables=draft_workflow.environment_variables,
conversation_variables=draft_workflow.conversation_variables,
marked_name=marked_name,
marked_comment=marked_comment,
rag_pipeline_variables=draft_workflow.rag_pipeline_variables,
features=draft_workflow.features,
)
# commit db session changes
@ -565,12 +563,12 @@ class WorkflowService:
# This will prevent validation errors from breaking the workflow
return []
def get_default_block_configs(self) -> list[dict]:
def get_default_block_configs(self) -> Sequence[Mapping[str, object]]:
"""
Get default block configs
"""
# return default block config
default_block_configs = []
default_block_configs: list[Mapping[str, object]] = []
for node_class_mapping in NODE_TYPE_CLASSES_MAPPING.values():
node_class = node_class_mapping[LATEST_VERSION]
default_config = node_class.get_default_config()
@ -579,7 +577,9 @@ class WorkflowService:
return default_block_configs
def get_default_block_config(self, node_type: str, filters: dict | None = None) -> dict | None:
def get_default_block_config(
self, node_type: str, filters: Mapping[str, object] | None = None
) -> Mapping[str, object]:
"""
Get default config of node.
:param node_type: node type
@ -590,12 +590,12 @@ class WorkflowService:
# return default block config
if node_type_enum not in NODE_TYPE_CLASSES_MAPPING:
return None
return {}
node_class = NODE_TYPE_CLASSES_MAPPING[node_type_enum][LATEST_VERSION]
default_config = node_class.get_default_config(filters=filters)
if not default_config:
return None
return {}
return default_config
@ -677,7 +677,7 @@ class WorkflowService:
# run draft workflow node
start_at = time.perf_counter()
node_execution = self._handle_node_run_result(
node_execution = self._handle_single_step_result(
invoke_node_fn=lambda: run,
start_at=start_at,
node_id=node_id,
@ -699,6 +699,9 @@ class WorkflowService:
if workflow_node_execution is None:
raise ValueError(f"WorkflowNodeExecution with id {node_execution.id} not found after saving")
with Session(db.engine) as session:
outputs = workflow_node_execution.load_full_outputs(session, storage)
with Session(bind=db.engine) as session, session.begin():
draft_var_saver = DraftVariableSaver(
session=session,
@ -707,8 +710,9 @@ class WorkflowService:
node_type=NodeType(workflow_node_execution.node_type),
enclosing_node_id=enclosing_node_id,
node_execution_id=node_execution.id,
user=account,
)
draft_var_saver.save(process_data=node_execution.process_data, outputs=node_execution.outputs)
draft_var_saver.save(process_data=node_execution.process_data, outputs=outputs)
session.commit()
return workflow_node_execution
@ -722,7 +726,7 @@ class WorkflowService:
# run free workflow node
start_at = time.perf_counter()
node_execution = self._handle_node_run_result(
node_execution = self._handle_single_step_result(
invoke_node_fn=lambda: WorkflowEntry.run_free_node(
node_id=node_id,
node_data=node_data,
@ -736,103 +740,131 @@ class WorkflowService:
return node_execution
def _handle_node_run_result(
def _handle_single_step_result(
self,
invoke_node_fn: Callable[[], tuple[BaseNode, Generator[NodeEvent | InNodeEvent, None, None]]],
invoke_node_fn: Callable[[], tuple[Node, Generator[GraphNodeEventBase, None, None]]],
start_at: float,
node_id: str,
) -> WorkflowNodeExecution:
try:
node, node_events = invoke_node_fn()
"""
Handle single step execution and return WorkflowNodeExecution.
node_run_result: NodeRunResult | None = None
for event in node_events:
if isinstance(event, RunCompletedEvent):
node_run_result = event.run_result
Args:
invoke_node_fn: Function to invoke node execution
start_at: Execution start time
node_id: ID of the node being executed
# sign output files
# node_run_result.outputs = WorkflowEntry.handle_special_values(node_run_result.outputs)
break
Returns:
WorkflowNodeExecution: The execution result
"""
node, node_run_result, run_succeeded, error = self._execute_node_safely(invoke_node_fn)
if not node_run_result:
raise ValueError("Node run failed with no run result")
# single step debug mode error handling return
if node_run_result.status == WorkflowNodeExecutionStatus.FAILED and node.continue_on_error:
node_error_args: dict[str, Any] = {
"status": WorkflowNodeExecutionStatus.EXCEPTION,
"error": node_run_result.error,
"inputs": node_run_result.inputs,
"metadata": {"error_strategy": node.error_strategy},
}
if node.error_strategy is ErrorStrategy.DEFAULT_VALUE:
node_run_result = NodeRunResult(
**node_error_args,
outputs={
**node.default_value_dict,
"error_message": node_run_result.error,
"error_type": node_run_result.error_type,
},
)
else:
node_run_result = NodeRunResult(
**node_error_args,
outputs={
"error_message": node_run_result.error,
"error_type": node_run_result.error_type,
},
)
run_succeeded = node_run_result.status in (
WorkflowNodeExecutionStatus.SUCCEEDED,
WorkflowNodeExecutionStatus.EXCEPTION,
)
error = node_run_result.error if not run_succeeded else None
except WorkflowNodeRunFailedError as e:
node = e.node
run_succeeded = False
node_run_result = None
error = e.error
# Create a NodeExecution domain model
# Create base node execution
node_execution = WorkflowNodeExecution(
id=str(uuid4()),
workflow_id="", # This is a single-step execution, so no workflow ID
id=str(uuid.uuid4()),
workflow_id="", # Single-step execution has no workflow ID
index=1,
node_id=node_id,
node_type=node.type_,
node_type=node.node_type,
title=node.title,
elapsed_time=time.perf_counter() - start_at,
created_at=naive_utc_now(),
finished_at=naive_utc_now(),
)
# Populate execution result data
self._populate_execution_result(node_execution, node_run_result, run_succeeded, error)
return node_execution
def _execute_node_safely(
    self, invoke_node_fn: Callable[[], tuple[Node, Generator[GraphNodeEventBase, None, None]]]
) -> tuple[Node, NodeRunResult | None, bool, str | None]:
    """
    Run a node and normalise the outcome into a tuple.

    The event stream is consumed up to the first `NodeRunSucceededEvent` or
    `NodeRunFailedEvent`, whose `node_run_result` becomes the result. A FAILED result on a
    node that has an error strategy is rewritten by `_apply_error_strategy`. Results with
    status SUCCEEDED or EXCEPTION count as a successful run.

    Returns:
        Tuple of (node, node_run_result, run_succeeded, error). When
        `WorkflowNodeRunFailedError` is raised, the node and error are taken from the
        exception, the result is None and `run_succeeded` is False.

    Raises:
        ValueError: if no terminal event with a result is produced.
    """
    try:
        node, node_events = invoke_node_fn()

        result = None
        for event in node_events:
            if isinstance(event, (NodeRunSucceededEvent, NodeRunFailedEvent)):
                result = event.node_run_result
                break

        if not result:
            raise ValueError("Node execution failed - no result returned")

        # Apply error strategy if node failed
        if result.status == WorkflowNodeExecutionStatus.FAILED and node.error_strategy:
            result = self._apply_error_strategy(node, result)

        succeeded = result.status in (
            WorkflowNodeExecutionStatus.SUCCEEDED,
            WorkflowNodeExecutionStatus.EXCEPTION,
        )
        failure_message = None if succeeded else result.error
        return node, result, succeeded, failure_message
    except WorkflowNodeRunFailedError as exc:
        return exc.node, None, False, exc.error
def _apply_error_strategy(self, node: Node, node_run_result: NodeRunResult) -> NodeRunResult:
    """
    Convert a failed run result into an EXCEPTION result according to the node's error strategy.

    The outputs always carry `error_message` and `error_type`. With the DEFAULT_VALUE
    strategy the node's `default_value_dict` is merged on top of them.
    """
    # TODO(Novice): Maybe we should apply error strategy to node level?
    strategy = node.error_strategy
    # Default values are only contributed when the strategy is DEFAULT_VALUE.
    default_outputs = node.default_value_dict if strategy is ErrorStrategy.DEFAULT_VALUE else {}

    return NodeRunResult(
        status=WorkflowNodeExecutionStatus.EXCEPTION,
        error=node_run_result.error,
        inputs=node_run_result.inputs,
        metadata={WorkflowNodeExecutionMetadataKey.ERROR_STRATEGY: strategy},
        outputs={
            "error_message": node_run_result.error,
            "error_type": node_run_result.error_type,
            **default_outputs,
        },
    )
def _populate_execution_result(
self,
node_execution: WorkflowNodeExecution,
node_run_result: NodeRunResult | None,
run_succeeded: bool,
error: str | None,
) -> None:
"""Populate node execution with result data."""
if run_succeeded and node_run_result:
# Set inputs, process_data, and outputs as dictionaries (not JSON strings)
inputs = WorkflowEntry.handle_special_values(node_run_result.inputs) if node_run_result.inputs else None
process_data = (
node_execution.inputs = (
WorkflowEntry.handle_special_values(node_run_result.inputs) if node_run_result.inputs else None
)
node_execution.process_data = (
WorkflowEntry.handle_special_values(node_run_result.process_data)
if node_run_result.process_data
else None
)
outputs = node_run_result.outputs
node_execution.inputs = inputs
node_execution.process_data = process_data
node_execution.outputs = outputs
node_execution.outputs = node_run_result.outputs
node_execution.metadata = node_run_result.metadata
# Map status from WorkflowNodeExecutionStatus to NodeExecutionStatus
if node_run_result.status == WorkflowNodeExecutionStatus.SUCCEEDED:
node_execution.status = WorkflowNodeExecutionStatus.SUCCEEDED
elif node_run_result.status == WorkflowNodeExecutionStatus.EXCEPTION:
node_execution.status = WorkflowNodeExecutionStatus.EXCEPTION
# Set status and error based on result
node_execution.status = node_run_result.status
if node_run_result.status == WorkflowNodeExecutionStatus.EXCEPTION:
node_execution.error = node_run_result.error
else:
# Set failed status and error
node_execution.status = WorkflowNodeExecutionStatus.FAILED
node_execution.error = error
return node_execution
def convert_to_workflow(self, app_model: App, account: Account, args: dict) -> App:
"""
Basic mode of chatbot app(expert mode) to workflow