feat: knowledge pipeline (#25360)

Signed-off-by: -LAN- <laipz8200@outlook.com> Co-authored-by: twwu <twwu@dify.ai> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> Co-authored-by: jyong <718720800@qq.com> Co-authored-by: Wu Tianwei <30284043+WTW0313@users.noreply.github.com> Co-authored-by: QuantumGhost <obelisk.reg+git@gmail.com> Co-authored-by: lyzno1 <yuanyouhuilyz@gmail.com> Co-authored-by: quicksand <quicksandzn@gmail.com> Co-authored-by: Jyong <76649700+JohnJyong@users.noreply.github.com> Co-authored-by: lyzno1 <92089059+lyzno1@users.noreply.github.com> Co-authored-by: zxhlyh <jasonapring2015@outlook.com> Co-authored-by: Yongtao Huang <yongtaoh2022@gmail.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: nite-knite <nkCoding@gmail.com> Co-authored-by: Hanqing Zhao <sherry9277@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry <xh001x@hotmail.com>
2025-09-18 12:49:10 +08:00 · 2025-09-18 12:49:10 +08:00 · 85cda47c70
commit 85cda47c70
parent 7dadb33003
1772 changed files with 102407 additions and 31710 deletions
--- a/api/services/app_dsl_service.py
+++ b/api/services/app_dsl_service.py
@ -20,7 +20,7 @@ from configs import dify_config
 from core.helper import ssrf_proxy
 from core.model_runtime.utils.encoders import jsonable_encoder
 from core.plugin.entities.plugin import PluginDependency
-from core.workflow.nodes.enums import NodeType
+from core.workflow.enums import NodeType
 from core.workflow.nodes.knowledge_retrieval.entities import KnowledgeRetrievalNodeData
 from core.workflow.nodes.llm.entities import LLMNodeData
 from core.workflow.nodes.parameter_extractor.entities import ParameterExtractorNodeData
--- a/api/services/app_generate_service.py
+++ b/api/services/app_generate_service.py
@ -116,7 +116,6 @@ class AppGenerateService:
                            invoke_from=invoke_from,
                            streaming=streaming,
                            call_depth=0,
-                            workflow_thread_pool_id=None,
                        ),
                    ),
                    request_id,
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@ -16,9 +16,9 @@ from werkzeug.exceptions import NotFound

 from configs import dify_config
 from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError
+from core.helper.name_generator import generate_incremental_name
 from core.model_manager import ModelManager
 from core.model_runtime.entities.model_entities import ModelType
-from core.plugin.entities.plugin import ModelProviderID
 from core.rag.index_processor.constant.built_in_field import BuiltInField
 from core.rag.index_processor.constant.index_type import IndexType
 from core.rag.retrieval.retrieval_methods import RetrievalMethod
@ -43,9 +43,12 @@ from models.dataset import (
    Document,
    DocumentSegment,
    ExternalKnowledgeBindings,
+    Pipeline,
 )
 from models.model import UploadFile
+from models.provider_ids import ModelProviderID
 from models.source import DataSourceOauthBinding
+from models.workflow import Workflow
 from services.entities.knowledge_entities.knowledge_entities import (
    ChildChunkUpdateArgs,
    KnowledgeConfig,
@ -53,6 +56,10 @@ from services.entities.knowledge_entities.knowledge_entities import (
    RetrievalModel,
    SegmentUpdateArgs,
 )
+from services.entities.knowledge_entities.rag_pipeline_entities import (
+    KnowledgeConfiguration,
+    RagPipelineDatasetCreateEntity,
+)
 from services.errors.account import NoPermissionError
 from services.errors.chunk import ChildChunkDeleteIndexError, ChildChunkIndexingError
 from services.errors.dataset import DatasetNameDuplicateError
@ -60,11 +67,13 @@ from services.errors.document import DocumentIndexingError
 from services.errors.file import FileNotExistsError
 from services.external_knowledge_service import ExternalDatasetService
 from services.feature_service import FeatureModel, FeatureService
+from services.rag_pipeline.rag_pipeline import RagPipelineService
 from services.tag_service import TagService
 from services.vector_service import VectorService
 from tasks.add_document_to_index_task import add_document_to_index_task
 from tasks.batch_clean_document_task import batch_clean_document_task
 from tasks.clean_notion_document_task import clean_notion_document_task
+from tasks.deal_dataset_index_update_task import deal_dataset_index_update_task
 from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task
 from tasks.delete_segment_from_index_task import delete_segment_from_index_task
 from tasks.disable_segment_from_index_task import disable_segment_from_index_task
@ -256,6 +265,55 @@ class DatasetService:
        db.session.commit()
        return dataset

+    @staticmethod
+    def create_empty_rag_pipeline_dataset(
+        tenant_id: str,
+        rag_pipeline_dataset_create_entity: RagPipelineDatasetCreateEntity,
+    ):
+        """
+        Create an empty dataset in "rag_pipeline" runtime mode together with its pipeline.
+
+        When the entity carries a name, that name must not already be used by a dataset of
+        the tenant. When it does not, a name is produced by ``generate_incremental_name``
+        from the tenant's existing dataset names and the "Untitled" prefix, and is written
+        back onto the entity.
+
+        :param tenant_id: tenant that owns the new pipeline and dataset
+        :param rag_pipeline_dataset_create_entity: name, description, permission and icon info
+        :return: the committed ``Dataset``, linked to the new ``Pipeline`` through ``pipeline_id``
+        :raises DatasetNameDuplicateError: if the requested name already exists in the tenant
+        :raises ValueError: if there is no current user or the current user has no id
+        """
+        if rag_pipeline_dataset_create_entity.name:
+            # check if dataset name already exists
+            if (
+                db.session.query(Dataset)
+                .filter_by(name=rag_pipeline_dataset_create_entity.name, tenant_id=tenant_id)
+                .first()
+            ):
+                raise DatasetNameDuplicateError(
+                    f"Dataset with name {rag_pipeline_dataset_create_entity.name} already exists."
+                )
+        else:
+            # generate an incremental name: Untitled 1 2 3 ...
+            # Only the name column is needed, so do not load full Dataset rows for the whole tenant.
+            names = list(db.session.scalars(select(Dataset.name).where(Dataset.tenant_id == tenant_id)).all())
+            rag_pipeline_dataset_create_entity.name = generate_incremental_name(
+                names,
+                "Untitled",
+            )
+        if not current_user or not current_user.id:
+            raise ValueError("Current user or current user id not found")
+        pipeline = Pipeline(
+            tenant_id=tenant_id,
+            name=rag_pipeline_dataset_create_entity.name,
+            description=rag_pipeline_dataset_create_entity.description,
+            created_by=current_user.id,
+        )
+        db.session.add(pipeline)
+        # flush (not commit) so pipeline.id is available while both rows stay in one transaction
+        db.session.flush()
+
+        dataset = Dataset(
+            tenant_id=tenant_id,
+            name=rag_pipeline_dataset_create_entity.name,
+            description=rag_pipeline_dataset_create_entity.description,
+            permission=rag_pipeline_dataset_create_entity.permission,
+            provider="vendor",
+            runtime_mode="rag_pipeline",
+            icon_info=rag_pipeline_dataset_create_entity.icon_info.model_dump(),
+            created_by=current_user.id,
+            pipeline_id=pipeline.id,
+        )
+        db.session.add(dataset)
+        db.session.commit()
+        return dataset
+
+
    @staticmethod
    def get_dataset(dataset_id) -> Dataset | None:
        dataset: Dataset | None = db.session.query(Dataset).filter_by(id=dataset_id).first()
@ -339,6 +397,14 @@ class DatasetService:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise ValueError("Dataset not found")
+        # check whether another dataset of this tenant already uses the requested name
+
+        if DatasetService._has_dataset_same_name(
+            tenant_id=dataset.tenant_id,
+            dataset_id=dataset_id,
+            name=data.get("name", dataset.name),
+        ):
+            raise ValueError("Dataset name already exists")

        # Verify user has permission to update this dataset
        DatasetService.check_dataset_permission(dataset, user)
@ -349,6 +415,19 @@ class DatasetService:
        else:
            return DatasetService._update_internal_dataset(dataset, data, user)

+    @staticmethod
+    def _has_dataset_same_name(tenant_id: str, dataset_id: str, name: str):
+        """
+        Tell whether a different dataset of the tenant already carries ``name``.
+
+        The dataset identified by ``dataset_id`` is excluded from the lookup, so a dataset
+        never clashes with itself.
+        """
+        clash_conditions = (
+            Dataset.id != dataset_id,
+            Dataset.name == name,
+            Dataset.tenant_id == tenant_id,
+        )
+        name_clash = db.session.query(Dataset).where(*clash_conditions).first()
+        return name_clash is not None
+
+
    @staticmethod
    def _update_external_dataset(dataset, data, user):
        """
@ -454,17 +533,105 @@ class DatasetService:
        filtered_data["updated_at"] = naive_utc_now()
        # update Retrieval model
        filtered_data["retrieval_model"] = data["retrieval_model"]
+        # update icon info
+        if data.get("icon_info"):
+            filtered_data["icon_info"] = data.get("icon_info")

        # Update dataset in database
        db.session.query(Dataset).filter_by(id=dataset.id).update(filtered_data)
        db.session.commit()

+        # update pipeline knowledge base node data
+        DatasetService._update_pipeline_knowledge_base_node_data(dataset, user.id)
+
        # Trigger vector index task if indexing technique changed
        if action:
            deal_dataset_vector_index_task.delay(dataset.id, action)

        return dataset

+    @staticmethod
+    def _update_pipeline_knowledge_base_node_data(dataset: Dataset, updata_user_id: str):
+        """
+        Update pipeline knowledge base node data.
+
+        Copies the dataset's embedding model, embedding model provider, retrieval model,
+        chunk structure, indexing technique and keyword number into every "knowledge-index"
+        node of the pipeline's published and draft workflow graphs. A changed published
+        graph is stored as a new workflow version; a changed draft graph is updated in place.
+        Nothing happens unless the dataset is in "rag_pipeline" runtime mode and its pipeline
+        exists. A graph whose nodes already hold the dataset's values is left untouched, so
+        no new workflow version is created for it.
+
+        :param dataset: dataset whose settings are pushed into the workflow graphs
+        :param updata_user_id: id stored as ``created_by`` of a newly created workflow version
+        :raises Exception: any failure is logged, the session is rolled back and the error re-raised
+        """
+        if dataset.runtime_mode != "rag_pipeline":
+            return
+
+        pipeline = db.session.query(Pipeline).filter_by(id=dataset.pipeline_id).first()
+        if not pipeline:
+            return
+
+        try:
+            rag_pipeline_service = RagPipelineService()
+            published_workflow = rag_pipeline_service.get_published_workflow(pipeline)
+            draft_workflow = rag_pipeline_service.get_draft_workflow(pipeline)
+
+            # update knowledge nodes
+            def update_knowledge_nodes(workflow_graph: str) -> str:
+                """Update knowledge-index nodes in workflow graph; return it unchanged if nothing differs."""
+                data: dict[str, Any] = json.loads(workflow_graph)
+
+                nodes = data.get("nodes", [])
+                updated = False
+
+                for node in nodes:
+                    if node.get("data", {}).get("type") != "knowledge-index":
+                        continue
+                    try:
+                        knowledge_index_node_data = node.get("data", {})
+                        dataset_settings = {
+                            "embedding_model": dataset.embedding_model,
+                            "embedding_model_provider": dataset.embedding_model_provider,
+                            "retrieval_model": dataset.retrieval_model,
+                            "chunk_structure": dataset.chunk_structure,
+                            "indexing_technique": dataset.indexing_technique,  # pyright: ignore[reportAttributeAccessIssue]
+                            "keyword_number": dataset.keyword_number,
+                        }
+                        for key, value in dataset_settings.items():
+                            # A missing key counts as a change so it is still written to the node.
+                            if key not in knowledge_index_node_data or knowledge_index_node_data[key] != value:
+                                knowledge_index_node_data[key] = value
+                                updated = True
+                        node["data"] = knowledge_index_node_data
+                    except Exception:
+                        logging.exception("Failed to update knowledge node")
+                        continue
+
+                # Re-serialize only on a real change: json.dumps output may differ textually from
+                # the stored graph, which would otherwise look like a modification to the callers below.
+                if updated:
+                    data["nodes"] = nodes
+                    return json.dumps(data)
+                return workflow_graph
+
+            # Update published workflow
+            if published_workflow:
+                updated_graph = update_knowledge_nodes(published_workflow.graph)
+                if updated_graph != published_workflow.graph:
+                    # Create new workflow version
+                    workflow = Workflow.new(
+                        tenant_id=pipeline.tenant_id,
+                        app_id=pipeline.id,
+                        type=published_workflow.type,
+                        version=str(datetime.datetime.now(datetime.UTC).replace(tzinfo=None)),
+                        graph=updated_graph,
+                        features=published_workflow.features,
+                        created_by=updata_user_id,
+                        environment_variables=published_workflow.environment_variables,
+                        conversation_variables=published_workflow.conversation_variables,
+                        rag_pipeline_variables=published_workflow.rag_pipeline_variables,
+                        marked_name="",
+                        marked_comment="",
+                    )
+                    db.session.add(workflow)
+
+            # Update draft workflow
+            if draft_workflow:
+                updated_graph = update_knowledge_nodes(draft_workflow.graph)
+                if updated_graph != draft_workflow.graph:
+                    draft_workflow.graph = updated_graph
+                    db.session.add(draft_workflow)
+
+            # Commit all changes in one transaction
+            db.session.commit()
+
+        except Exception:
+            logging.exception("Failed to update pipeline knowledge base node data")
+            db.session.rollback()
+            raise
+
+
    @staticmethod
    def _handle_indexing_technique_change(dataset, data, filtered_data):
        """
@ -654,6 +821,133 @@ class DatasetService:
        )
        filtered_data["collection_binding_id"] = dataset_collection_binding.id

+    @staticmethod
+    def update_rag_pipeline_dataset_settings(
+        session: Session, dataset: Dataset, knowledge_configuration: KnowledgeConfiguration, has_published: bool = False
+    ):
+        """
+        Apply a pipeline's knowledge configuration to its dataset.
+
+        While ``has_published`` is False, the chunk structure, indexing technique, embedding
+        or keyword settings and retrieval model are written onto the dataset, which is added
+        to ``session`` without committing here.
+
+        When ``has_published`` is True, an already set chunk structure may not change and the
+        indexing technique may not be switched to "economy". The changes are committed on
+        ``session`` and, if the embedding setup actually changed, ``deal_dataset_index_update_task``
+        is queued with action "add" (technique switched to "high_quality") or "update"
+        (a different embedding model was applied).
+
+        :param session: session the dataset is merged into and saved with
+        :param dataset: dataset to update; a merged copy is used internally
+        :param knowledge_configuration: settings taken from the pipeline's knowledge node
+        :param has_published: whether the restricted "already published" rules apply
+        :raises ValueError: if there is no current user/tenant, the index method is invalid,
+            a forbidden change is requested, or no usable embedding model is available
+        """
+        if not current_user or not current_user.current_tenant_id:
+            raise ValueError("Current user or current tenant not found")
+        dataset = session.merge(dataset)
+        if not has_published:
+            dataset.chunk_structure = knowledge_configuration.chunk_structure
+            dataset.indexing_technique = knowledge_configuration.indexing_technique
+            if knowledge_configuration.indexing_technique == "high_quality":
+                model_manager = ModelManager()
+                embedding_model = model_manager.get_model_instance(
+                    tenant_id=current_user.current_tenant_id,  # ignore type error
+                    provider=knowledge_configuration.embedding_model_provider or "",
+                    model_type=ModelType.TEXT_EMBEDDING,
+                    model=knowledge_configuration.embedding_model or "",
+                )
+                dataset.embedding_model = embedding_model.model
+                dataset.embedding_model_provider = embedding_model.provider
+                dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
+                    embedding_model.provider, embedding_model.model
+                )
+                dataset.collection_binding_id = dataset_collection_binding.id
+            elif knowledge_configuration.indexing_technique == "economy":
+                dataset.keyword_number = knowledge_configuration.keyword_number
+            else:
+                raise ValueError("Invalid index method")
+            dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump()
+            session.add(dataset)
+        else:
+            if dataset.chunk_structure and dataset.chunk_structure != knowledge_configuration.chunk_structure:
+                raise ValueError("Chunk structure is not allowed to be updated.")
+            # index task action to queue after the commit; stays None when the embedding setup is unchanged
+            action = None
+            if dataset.indexing_technique != knowledge_configuration.indexing_technique:
+                # if update indexing_technique
+                if knowledge_configuration.indexing_technique == "economy":
+                    raise ValueError("Knowledge base indexing technique is not allowed to be updated to economy.")
+                elif knowledge_configuration.indexing_technique == "high_quality":
+                    action = "add"
+                    # get embedding model setting
+                    try:
+                        model_manager = ModelManager()
+                        embedding_model = model_manager.get_model_instance(
+                            tenant_id=current_user.current_tenant_id,
+                            provider=knowledge_configuration.embedding_model_provider,
+                            model_type=ModelType.TEXT_EMBEDDING,
+                            model=knowledge_configuration.embedding_model,
+                        )
+                        dataset.embedding_model = embedding_model.model
+                        dataset.embedding_model_provider = embedding_model.provider
+                        dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
+                            embedding_model.provider, embedding_model.model
+                        )
+                        dataset.collection_binding_id = dataset_collection_binding.id
+                        dataset.indexing_technique = knowledge_configuration.indexing_technique
+                    except LLMBadRequestError as ex:
+                        raise ValueError(
+                            "No Embedding Model available. Please configure a valid provider "
+                            "in the Settings -> Model Provider."
+                        ) from ex
+                    except ProviderTokenNotInitError as ex:
+                        raise ValueError(ex.description) from ex
+            else:
+                # add default plugin id to both setting sets, to make sure the plugin model provider is consistent
+                if dataset.indexing_technique == "high_quality":
+                    try:
+                        # Handle existing model provider
+                        plugin_model_provider = dataset.embedding_model_provider
+                        plugin_model_provider_str = None
+                        if plugin_model_provider:
+                            plugin_model_provider_str = str(ModelProviderID(plugin_model_provider))
+
+                        # Handle new model provider from request
+                        new_plugin_model_provider = knowledge_configuration.embedding_model_provider
+                        new_plugin_model_provider_str = None
+                        if new_plugin_model_provider:
+                            new_plugin_model_provider_str = str(ModelProviderID(new_plugin_model_provider))
+
+                        # Switch the embedding model when the provider or the model differs from the current one
+                        if (
+                            plugin_model_provider_str != new_plugin_model_provider_str
+                            or knowledge_configuration.embedding_model != dataset.embedding_model
+                        ):
+                            model_manager = ModelManager()
+                            embedding_model = None
+                            try:
+                                embedding_model = model_manager.get_model_instance(
+                                    tenant_id=current_user.current_tenant_id,
+                                    provider=knowledge_configuration.embedding_model_provider,
+                                    model_type=ModelType.TEXT_EMBEDDING,
+                                    model=knowledge_configuration.embedding_model,
+                                )
+                            except ProviderTokenNotInitError:
+                                # If we can't get the embedding model, skip updating it
+                                # and keep the existing settings
+                                embedding_model = None
+                            if embedding_model:
+                                dataset.embedding_model = embedding_model.model
+                                dataset.embedding_model_provider = embedding_model.provider
+                                dataset_collection_binding = (
+                                    DatasetCollectionBindingService.get_dataset_collection_binding(
+                                        embedding_model.provider, embedding_model.model
+                                    )
+                                )
+                                dataset.collection_binding_id = dataset_collection_binding.id
+                                # Queue the index update only once the new model was really applied;
+                                # a skipped update must not trigger a re-index with unchanged settings.
+                                action = "update"
+                    except LLMBadRequestError as ex:
+                        raise ValueError(
+                            "No Embedding Model available. Please configure a valid provider "
+                            "in the Settings -> Model Provider."
+                        ) from ex
+                    except ProviderTokenNotInitError as ex:
+                        raise ValueError(ex.description) from ex
+                elif dataset.indexing_technique == "economy":
+                    if dataset.keyword_number != knowledge_configuration.keyword_number:
+                        dataset.keyword_number = knowledge_configuration.keyword_number
+            dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump()
+            session.add(dataset)
+            session.commit()
+            if action:
+                deal_dataset_index_update_task.delay(dataset.id, action)
+
+
    @staticmethod
    def delete_dataset(dataset_id, user):
        dataset = DatasetService.get_dataset(dataset_id)
@ -730,6 +1024,18 @@ class DatasetService:
            .all()
        )

+    @staticmethod
+    def update_dataset_api_status(dataset_id: str, status: bool):
+        """
+        Set the ``enable_api`` flag of a dataset and record who changed it and when.
+
+        :param dataset_id: id of the dataset to update
+        :param status: new value for ``enable_api``
+        :raises NotFound: if no dataset with ``dataset_id`` exists
+        :raises ValueError: if there is no current user or the current user has no id
+        """
+        dataset = DatasetService.get_dataset(dataset_id)
+        if dataset is None:
+            raise NotFound("Dataset not found.")
+        # Validate the acting user before touching the dataset, so a failed check
+        # does not leave a modified, uncommitted object in the session.
+        if not current_user or not current_user.id:
+            raise ValueError("Current user or current user id not found")
+        dataset.enable_api = status
+        dataset.updated_by = current_user.id
+        dataset.updated_at = naive_utc_now()
+        db.session.commit()
+
+
    @staticmethod
    def get_dataset_auto_disable_logs(dataset_id: str):
        assert isinstance(current_user, Account)
@ -974,7 +1280,7 @@ class DocumentService:
            return
        documents = db.session.scalars(select(Document).where(Document.id.in_(document_ids))).all()
        file_ids = [
-            document.data_source_info_dict["upload_file_id"]
+            document.data_source_info_dict.get("upload_file_id", "")
            for document in documents
            if document.data_source_type == "upload_file" and document.data_source_info_dict
        ]
@ -1062,7 +1368,9 @@ class DocumentService:
            redis_client.setex(retry_indexing_cache_key, 600, 1)
        # trigger async task
        document_ids = [document.id for document in documents]
-        retry_document_indexing_task.delay(dataset_id, document_ids)
+        if not current_user or not current_user.id:
+            raise ValueError("Current user or current user id not found")
+        retry_document_indexing_task.delay(dataset_id, document_ids, current_user.id)

    @staticmethod
    def sync_website_document(dataset_id: str, document: Document):
@ -1211,7 +1519,7 @@ class DocumentService:
                        )
                        return [], ""
                    db.session.add(dataset_process_rule)
-                    db.session.commit()
+                    db.session.flush()
            lock_name = f"add_document_lock_dataset_id_{dataset.id}"
            with redis_client.lock(lock_name, timeout=600):
                position = DocumentService.get_documents_position(dataset.id)
@ -1301,23 +1609,10 @@ class DocumentService:
                            exist_document[data_source_info["notion_page_id"]] = document.id
                    for notion_info in notion_info_list:
                        workspace_id = notion_info.workspace_id
-                        data_source_binding = (
-                            db.session.query(DataSourceOauthBinding)
-                            .where(
-                                db.and_(
-                                    DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
-                                    DataSourceOauthBinding.provider == "notion",
-                                    DataSourceOauthBinding.disabled == False,
-                                    DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
-                                )
-                            )
-                            .first()
-                        )
-                        if not data_source_binding:
-                            raise ValueError("Data source binding not found.")
                        for page in notion_info.pages:
                            if page.page_id not in exist_page_ids:
                                data_source_info = {
+                                    "credential_id": notion_info.credential_id,
                                    "notion_workspace_id": workspace_id,
                                    "notion_page_id": page.page_id,
                                    "notion_page_icon": page.page_icon.model_dump() if page.page_icon else None,
@ -1393,6 +1688,283 @@ class DocumentService:

        return documents, batch

+    # @staticmethod
+    # def save_document_with_dataset_id(
+    #     dataset: Dataset,
+    #     knowledge_config: KnowledgeConfig,
+    #     account: Account | Any,
+    #     dataset_process_rule: Optional[DatasetProcessRule] = None,
+    #     created_from: str = "web",
+    # ):
+    #     # check document limit
+    #     features = FeatureService.get_features(current_user.current_tenant_id)
+
+    #     if features.billing.enabled:
+    #         if not knowledge_config.original_document_id:
+    #             count = 0
+    #             if knowledge_config.data_source:
+    #                 if knowledge_config.data_source.info_list.data_source_type == "upload_file":
+    #                     upload_file_list = knowledge_config.data_source.info_list.file_info_list.file_ids
+    # # type: ignore
+    #                     count = len(upload_file_list)
+    #                 elif knowledge_config.data_source.info_list.data_source_type == "notion_import":
+    #                     notion_info_list = knowledge_config.data_source.info_list.notion_info_list
+    #                     for notion_info in notion_info_list:  # type: ignore
+    #                         count = count + len(notion_info.pages)
+    #                 elif knowledge_config.data_source.info_list.data_source_type == "website_crawl":
+    #                     website_info = knowledge_config.data_source.info_list.website_info_list
+    #                     count = len(website_info.urls)  # type: ignore
+    #                 batch_upload_limit = int(dify_config.BATCH_UPLOAD_LIMIT)
+
+    #                 if features.billing.subscription.plan == "sandbox" and count > 1:
+    #                     raise ValueError("Your current plan does not support batch upload, please upgrade your plan.")
+    #                 if count > batch_upload_limit:
+    #                     raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
+
+    #                 DocumentService.check_documents_upload_quota(count, features)
+
+    #     # if dataset is empty, update dataset data_source_type
+    #     if not dataset.data_source_type:
+    #         dataset.data_source_type = knowledge_config.data_source.info_list.data_source_type  # type: ignore
+
+    #     if not dataset.indexing_technique:
+    #         if knowledge_config.indexing_technique not in Dataset.INDEXING_TECHNIQUE_LIST:
+    #             raise ValueError("Indexing technique is invalid")
+
+    #         dataset.indexing_technique = knowledge_config.indexing_technique
+    #         if knowledge_config.indexing_technique == "high_quality":
+    #             model_manager = ModelManager()
+    #             if knowledge_config.embedding_model and knowledge_config.embedding_model_provider:
+    #                 dataset_embedding_model = knowledge_config.embedding_model
+    #                 dataset_embedding_model_provider = knowledge_config.embedding_model_provider
+    #             else:
+    #                 embedding_model = model_manager.get_default_model_instance(
+    #                     tenant_id=current_user.current_tenant_id, model_type=ModelType.TEXT_EMBEDDING
+    #                 )
+    #                 dataset_embedding_model = embedding_model.model
+    #                 dataset_embedding_model_provider = embedding_model.provider
+    #             dataset.embedding_model = dataset_embedding_model
+    #             dataset.embedding_model_provider = dataset_embedding_model_provider
+    #             dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
+    #                 dataset_embedding_model_provider, dataset_embedding_model
+    #             )
+    #             dataset.collection_binding_id = dataset_collection_binding.id
+    #             if not dataset.retrieval_model:
+    #                 default_retrieval_model = {
+    #                     "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
+    #                     "reranking_enable": False,
+    #                     "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
+    #                     "top_k": 2,
+    #                     "score_threshold_enabled": False,
+    #                 }
+
+    #                 dataset.retrieval_model = (
+    #                     knowledge_config.retrieval_model.model_dump()
+    #                     if knowledge_config.retrieval_model
+    #                     else default_retrieval_model
+    #                 )  # type: ignore
+
+    #     documents = []
+    #     if knowledge_config.original_document_id:
+    #         document = DocumentService.update_document_with_dataset_id(dataset, knowledge_config, account)
+    #         documents.append(document)
+    #         batch = document.batch
+    #     else:
+    #         batch = time.strftime("%Y%m%d%H%M%S") + str(random.randint(100000, 999999))
+    #         # save process rule
+    #         if not dataset_process_rule:
+    #             process_rule = knowledge_config.process_rule
+    #             if process_rule:
+    #                 if process_rule.mode in ("custom", "hierarchical"):
+    #                     dataset_process_rule = DatasetProcessRule(
+    #                         dataset_id=dataset.id,
+    #                         mode=process_rule.mode,
+    #                         rules=process_rule.rules.model_dump_json() if process_rule.rules else None,
+    #                         created_by=account.id,
+    #                     )
+    #                 elif process_rule.mode == "automatic":
+    #                     dataset_process_rule = DatasetProcessRule(
+    #                         dataset_id=dataset.id,
+    #                         mode=process_rule.mode,
+    #                         rules=json.dumps(DatasetProcessRule.AUTOMATIC_RULES),
+    #                         created_by=account.id,
+    #                     )
+    #                 else:
+    #                     logging.warn(
+    #                         f"Invalid process rule mode: {process_rule.mode}, can not find dataset process rule"
+    #                     )
+    #                     return
+    #                 db.session.add(dataset_process_rule)
+    #                 db.session.commit()
+    #         lock_name = "add_document_lock_dataset_id_{}".format(dataset.id)
+    #         with redis_client.lock(lock_name, timeout=600):
+    #             position = DocumentService.get_documents_position(dataset.id)
+    #             document_ids = []
+    #             duplicate_document_ids = []
+    #             if knowledge_config.data_source.info_list.data_source_type == "upload_file":  # type: ignore
+    #                 upload_file_list = knowledge_config.data_source.info_list.file_info_list.file_ids  # type: ignore
+    #                 for file_id in upload_file_list:
+    #                     file = (
+    #                         db.session.query(UploadFile)
+    #                         .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
+    #                         .first()
+    #                     )
+
+    #                     # raise error if file not found
+    #                     if not file:
+    #                         raise FileNotExistsError()
+
+    #                     file_name = file.name
+    #                     data_source_info = {
+    #                         "upload_file_id": file_id,
+    #                     }
+    #                     # check duplicate
+    #                     if knowledge_config.duplicate:
+    #                         document = Document.query.filter_by(
+    #                             dataset_id=dataset.id,
+    #                             tenant_id=current_user.current_tenant_id,
+    #                             data_source_type="upload_file",
+    #                             enabled=True,
+    #                             name=file_name,
+    #                         ).first()
+    #                         if document:
+    #                             document.dataset_process_rule_id = dataset_process_rule.id  # type: ignore
+    #                             document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
+    #                             document.created_from = created_from
+    #                             document.doc_form = knowledge_config.doc_form
+    #                             document.doc_language = knowledge_config.doc_language
+    #                             document.data_source_info = json.dumps(data_source_info)
+    #                             document.batch = batch
+    #                             document.indexing_status = "waiting"
+    #                             db.session.add(document)
+    #                             documents.append(document)
+    #                             duplicate_document_ids.append(document.id)
+    #                             continue
+    #                     document = DocumentService.build_document(
+    #                         dataset,
+    #                         dataset_process_rule.id,  # type: ignore
+    #                         knowledge_config.data_source.info_list.data_source_type,  # type: ignore
+    #                         knowledge_config.doc_form,
+    #                         knowledge_config.doc_language,
+    #                         data_source_info,
+    #                         created_from,
+    #                         position,
+    #                         account,
+    #                         file_name,
+    #                         batch,
+    #                     )
+    #                     db.session.add(document)
+    #                     db.session.flush()
+    #                     document_ids.append(document.id)
+    #                     documents.append(document)
+    #                     position += 1
+    #             elif knowledge_config.data_source.info_list.data_source_type == "notion_import":  # type: ignore
+    #                 notion_info_list = knowledge_config.data_source.info_list.notion_info_list  # type: ignore
+    #                 if not notion_info_list:
+    #                     raise ValueError("No notion info list found.")
+    #                 exist_page_ids = []
+    #                 exist_document = {}
+    #                 documents = Document.query.filter_by(
+    #                     dataset_id=dataset.id,
+    #                     tenant_id=current_user.current_tenant_id,
+    #                     data_source_type="notion_import",
+    #                     enabled=True,
+    #                 ).all()
+    #                 if documents:
+    #                     for document in documents:
+    #                         data_source_info = json.loads(document.data_source_info)
+    #                         exist_page_ids.append(data_source_info["notion_page_id"])
+    #                         exist_document[data_source_info["notion_page_id"]] = document.id
+    #                 for notion_info in notion_info_list:
+    #                     workspace_id = notion_info.workspace_id
+    #                     data_source_binding = DataSourceOauthBinding.query.filter(
+    #                         db.and_(
+    #                             DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
+    #                             DataSourceOauthBinding.provider == "notion",
+    #                             DataSourceOauthBinding.disabled == False,
+    #                             DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
+    #                         )
+    #                     ).first()
+    #                     if not data_source_binding:
+    #                         raise ValueError("Data source binding not found.")
+    #                     for page in notion_info.pages:
+    #                         if page.page_id not in exist_page_ids:
+    #                             data_source_info = {
+    #                                 "notion_workspace_id": workspace_id,
+    #                                 "notion_page_id": page.page_id,
+    #                                 "notion_page_icon": page.page_icon.model_dump() if page.page_icon else None,
+    #                                 "type": page.type,
+    #                             }
+    #                             # Truncate page name to 255 characters to prevent DB field length errors
+    #                             truncated_page_name = page.page_name[:255] if page.page_name else "nopagename"
+    #                             document = DocumentService.build_document(
+    #                                 dataset,
+    #                                 dataset_process_rule.id,  # type: ignore
+    #                                 knowledge_config.data_source.info_list.data_source_type,  # type: ignore
+    #                                 knowledge_config.doc_form,
+    #                                 knowledge_config.doc_language,
+    #                                 data_source_info,
+    #                                 created_from,
+    #                                 position,
+    #                                 account,
+    #                                 truncated_page_name,
+    #                                 batch,
+    #                             )
+    #                             db.session.add(document)
+    #                             db.session.flush()
+    #                             document_ids.append(document.id)
+    #                             documents.append(document)
+    #                             position += 1
+    #                         else:
+    #                             exist_document.pop(page.page_id)
+    #                 # delete not selected documents
+    #                 if len(exist_document) > 0:
+    #                     clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
+    #             elif knowledge_config.data_source.info_list.data_source_type == "website_crawl":  # type: ignore
+    #                 website_info = knowledge_config.data_source.info_list.website_info_list  # type: ignore
+    #                 if not website_info:
+    #                     raise ValueError("No website info list found.")
+    #                 urls = website_info.urls
+    #                 for url in urls:
+    #                     data_source_info = {
+    #                         "url": url,
+    #                         "provider": website_info.provider,
+    #                         "job_id": website_info.job_id,
+    #                         "only_main_content": website_info.only_main_content,
+    #                         "mode": "crawl",
+    #                     }
+    #                     if len(url) > 255:
+    #                         document_name = url[:200] + "..."
+    #                     else:
+    #                         document_name = url
+    #                     document = DocumentService.build_document(
+    #                         dataset,
+    #                         dataset_process_rule.id,  # type: ignore
+    #                         knowledge_config.data_source.info_list.data_source_type,  # type: ignore
+    #                         knowledge_config.doc_form,
+    #                         knowledge_config.doc_language,
+    #                         data_source_info,
+    #                         created_from,
+    #                         position,
+    #                         account,
+    #                         document_name,
+    #                         batch,
+    #                     )
+    #                     db.session.add(document)
+    #                     db.session.flush()
+    #                     document_ids.append(document.id)
+    #                     documents.append(document)
+    #                     position += 1
+    #             db.session.commit()
+
+    #             # trigger async task
+    #             if document_ids:
+    #                 document_indexing_task.delay(dataset.id, document_ids)
+    #             if duplicate_document_ids:
+    #                 duplicate_document_indexing_task.delay(dataset.id, duplicate_document_ids)
+
+    #     return documents, batch
+
    @staticmethod
    def check_documents_upload_quota(count: int, features: FeatureModel):
        can_upload_size = features.documents_upload_quota.limit - features.documents_upload_quota.size
@ -1404,7 +1976,7 @@ class DocumentService:
    @staticmethod
    def build_document(
        dataset: Dataset,
-        process_rule_id: str,
+        process_rule_id: str | None,
        data_source_type: str,
        document_form: str,
        document_language: str,
@ -1540,6 +2112,7 @@ class DocumentService:
                        raise ValueError("Data source binding not found.")
                    for page in notion_info.pages:
                        data_source_info = {
+                            "credential_id": notion_info.credential_id,
                            "notion_workspace_id": workspace_id,
                            "notion_page_id": page.page_id,
                            "notion_page_icon": page.page_icon.model_dump() if page.page_icon else None,  # type: ignore
@ -2352,6 +2925,8 @@ class SegmentService:
            segment.error = str(e)
            db.session.commit()
        new_segment = db.session.query(DocumentSegment).where(DocumentSegment.id == segment.id).first()
+        if not new_segment:
+            raise ValueError("new_segment is not found")
        return new_segment

    @classmethod
@ -2430,9 +3005,11 @@ class SegmentService:
        if index_node_ids or child_node_ids:
            delete_segment_from_index_task.delay(index_node_ids, dataset.id, document.id, child_node_ids)

-        document.word_count = (
-            document.word_count - total_words if document.word_count and document.word_count > total_words else 0
-        )
+        if document.word_count is None:
+            document.word_count = 0
+        else:
+            document.word_count = max(0, document.word_count - total_words)
+
        db.session.add(document)

        # Delete database records
--- a/api/services/datasource_provider_service.py
+++ b/api/services/datasource_provider_service.py
@ -0,0 +1,975 @@
+import logging
+import time
+from collections.abc import Mapping
+from typing import Any
+
+from flask_login import current_user
+from sqlalchemy.orm import Session
+
+from configs import dify_config
+from constants import HIDDEN_VALUE, UNKNOWN_VALUE
+from core.helper import encrypter
+from core.helper.name_generator import generate_incremental_name
+from core.helper.provider_cache import NoOpProviderCredentialCache
+from core.model_runtime.entities.provider_entities import FormType
+from core.plugin.impl.datasource import PluginDatasourceManager
+from core.plugin.impl.oauth import OAuthHandler
+from core.tools.entities.tool_entities import CredentialType
+from core.tools.utils.encryption import ProviderConfigCache, ProviderConfigEncrypter, create_provider_encrypter
+from extensions.ext_database import db
+from extensions.ext_redis import redis_client
+from models.oauth import DatasourceOauthParamConfig, DatasourceOauthTenantParamConfig, DatasourceProvider
+from models.provider_ids import DatasourceProviderID
+from services.plugin.plugin_service import PluginService
+
+logger = logging.getLogger(__name__)
+
+
+class DatasourceProviderService:
+    """
+    Datasource Provider Service
+    """
+
+    def __init__(self) -> None:
+        """Create the service and keep a ``PluginDatasourceManager`` on ``self.provider_manager``."""
+        self.provider_manager = PluginDatasourceManager()
+
+    def remove_oauth_custom_client_params(self, tenant_id: str, datasource_provider_id: DatasourceProviderID):
+        """
+        Delete the tenant's custom OAuth client params for a datasource provider.
+
+        Rows are matched by tenant id, provider name and plugin id; the deletion is
+        committed in a session opened for this call only.
+        """
+        with Session(db.engine) as session:
+            match_criteria = {
+                "tenant_id": tenant_id,
+                "provider": datasource_provider_id.provider_name,
+                "plugin_id": datasource_provider_id.plugin_id,
+            }
+            tenant_configs = session.query(DatasourceOauthTenantParamConfig).filter_by(**match_criteria)
+            tenant_configs.delete()
+            session.commit()
+
+    def decrypt_datasource_provider_credentials(
+        self,
+        tenant_id: str,
+        datasource_provider: DatasourceProvider,
+        plugin_id: str,
+        provider: str,
+    ) -> dict[str, Any]:
+        """
+        Return a copy of the provider's stored credentials with secret fields decrypted.
+
+        Only keys contained in the result of ``extract_secret_variables`` (looked up for
+        ``{plugin_id}/{provider}`` and the record's auth type) are decrypted; every other
+        value is returned unchanged. The stored mapping itself is not modified.
+        """
+        stored_credentials = datasource_provider.encrypted_credentials
+        secret_keys = self.extract_secret_variables(
+            tenant_id=tenant_id,
+            provider_id=f"{plugin_id}/{provider}",
+            credential_type=CredentialType.of(datasource_provider.auth_type),
+        )
+        plain_credentials = stored_credentials.copy()
+        # Only existing keys are reassigned, so mutating while iterating is safe.
+        for field_name in plain_credentials:
+            if field_name not in secret_keys:
+                continue
+            plain_credentials[field_name] = encrypter.decrypt_token(tenant_id, plain_credentials[field_name])
+        return plain_credentials
+
+    def encrypt_datasource_provider_credentials(
+        self,
+        tenant_id: str,
+        provider: str,
+        plugin_id: str,
+        raw_credentials: Mapping[str, Any],
+        datasource_provider: DatasourceProvider,
+    ) -> dict[str, Any]:
+        """
+        Return a dict copy of ``raw_credentials`` with secret fields encrypted.
+
+        Only keys contained in the result of ``extract_secret_variables`` (looked up for
+        ``{plugin_id}/{provider}`` and the record's auth type) are encrypted; every other
+        value is copied through unchanged. ``raw_credentials`` itself is not modified.
+        """
+        secret_keys = self.extract_secret_variables(
+            tenant_id=tenant_id,
+            provider_id=f"{plugin_id}/{provider}",
+            credential_type=CredentialType.of(datasource_provider.auth_type),
+        )
+        sealed_credentials = dict(raw_credentials)
+        # Only existing keys are reassigned, so mutating while iterating is safe.
+        for field_name in sealed_credentials:
+            if field_name in secret_keys:
+                sealed_credentials[field_name] = encrypter.encrypt_token(tenant_id, sealed_credentials[field_name])
+        return sealed_credentials
+
+    def get_datasource_credentials(
+        self,
+        tenant_id: str,
+        provider: str,
+        plugin_id: str,
+        credential_id: str | None = None,
+    ) -> dict[str, Any]:
+        """
+        Get the decrypted credentials of one datasource provider record.
+
+        When ``credential_id`` is given, the record is looked up by tenant and id only.
+        Otherwise the first record for the tenant/provider/plugin is used, ordered
+        default-first and then by creation time. Credentials that are about to expire
+        are refreshed through the OAuth handler, re-encrypted and committed before the
+        decrypted values are returned.
+
+        :param tenant_id: tenant owning the credentials
+        :param provider: datasource provider name
+        :param plugin_id: id of the plugin providing the datasource
+        :param credential_id: id of a specific record, or None to use the default/oldest one
+        :return: decrypted credentials, or an empty dict when no record matches
+        """
+        with Session(db.engine) as session:
+            if credential_id:
+                # Explicit credential: provider and plugin_id are not part of this lookup.
+                datasource_provider = (
+                    session.query(DatasourceProvider).filter_by(tenant_id=tenant_id, id=credential_id).first()
+                )
+            else:
+                # No explicit credential: prefer the default record, then the oldest one.
+                datasource_provider = (
+                    session.query(DatasourceProvider)
+                    .filter_by(tenant_id=tenant_id, provider=provider, plugin_id=plugin_id)
+                    .order_by(DatasourceProvider.is_default.desc(), DatasourceProvider.created_at.asc())
+                    .first()
+                )
+            if not datasource_provider:
+                return {}
+            # Refresh when an expiry is set and it is less than 60 seconds away.
+            # NOTE(review): expires_at == -1 presumably means "never expires" - confirm against the model.
+            if datasource_provider.expires_at != -1 and (datasource_provider.expires_at - 60) < int(time.time()):
+                decrypted_credentials = self.decrypt_datasource_provider_credentials(
+                    tenant_id=tenant_id,
+                    datasource_provider=datasource_provider,
+                    plugin_id=plugin_id,
+                    provider=provider,
+                )
+                datasource_provider_id = DatasourceProviderID(f"{plugin_id}/{provider}")
+                provider_name = datasource_provider_id.provider_name
+                redirect_uri = (
+                    f"{dify_config.CONSOLE_API_URL}/console/api/oauth/plugin/"
+                    f"{datasource_provider_id}/datasource/callback"
+                )
+                system_credentials = self.get_oauth_client(tenant_id, datasource_provider_id)
+                # NOTE(review): uses flask_login's current_user, so this presumably has to run
+                # inside an authenticated request context - confirm for background callers.
+                refreshed_credentials = OAuthHandler().refresh_credentials(
+                    tenant_id=tenant_id,
+                    user_id=current_user.id,
+                    plugin_id=datasource_provider_id.plugin_id,
+                    provider=provider_name,
+                    redirect_uri=redirect_uri,
+                    system_credentials=system_credentials or {},
+                    credentials=decrypted_credentials,
+                )
+                # Store the refreshed credentials encrypted, together with their new expiry.
+                datasource_provider.encrypted_credentials = self.encrypt_datasource_provider_credentials(
+                    tenant_id=tenant_id,
+                    raw_credentials=refreshed_credentials.credentials,
+                    provider=provider,
+                    plugin_id=plugin_id,
+                    datasource_provider=datasource_provider,
+                )
+                datasource_provider.expires_at = refreshed_credentials.expires_at
+                session.commit()
+
+            # Decrypt whatever is stored on the record now (refreshed or not).
+            return self.decrypt_datasource_provider_credentials(
+                tenant_id=tenant_id,
+                datasource_provider=datasource_provider,
+                plugin_id=plugin_id,
+                provider=provider,
+            )
+
+    def get_all_datasource_credentials_by_provider(
+        self,
+        tenant_id: str,
+        provider: str,
+        plugin_id: str,
+    ) -> list[dict[str, Any]]:
+        """
+        Get the decrypted credentials of every datasource provider record for a provider.
+
+        Records are ordered default-first and then by creation time. A record is
+        refreshed through the OAuth handler only when it has an expiry
+        (``expires_at != -1``) that is less than 60 seconds away - the same rule
+        ``get_datasource_credentials`` applies - so credentials without an expiry and
+        still-valid tokens no longer trigger a refresh call on every listing.
+        Refreshed credentials are re-encrypted and committed once after the loop.
+
+        :param tenant_id: tenant owning the credentials
+        :param provider: datasource provider name
+        :param plugin_id: id of the plugin providing the datasource
+        :return: decrypted credentials, one dict per record; empty list when none exist
+        """
+        with Session(db.engine) as session:
+            datasource_providers = (
+                session.query(DatasourceProvider)
+                .filter_by(tenant_id=tenant_id, provider=provider, plugin_id=plugin_id)
+                .order_by(DatasourceProvider.is_default.desc(), DatasourceProvider.created_at.asc())
+                .all()
+            )
+            if not datasource_providers:
+                return []
+
+            real_credentials_list = []
+            for datasource_provider in datasource_providers:
+                real_credentials = self.decrypt_datasource_provider_credentials(
+                    tenant_id=tenant_id,
+                    datasource_provider=datasource_provider,
+                    plugin_id=plugin_id,
+                    provider=provider,
+                )
+                # Refresh only when an expiry is set and it is less than 60 seconds away.
+                if datasource_provider.expires_at != -1 and (datasource_provider.expires_at - 60) < int(time.time()):
+                    datasource_provider_id = DatasourceProviderID(f"{plugin_id}/{provider}")
+                    provider_name = datasource_provider_id.provider_name
+                    redirect_uri = (
+                        f"{dify_config.CONSOLE_API_URL}/console/api/oauth/plugin/"
+                        f"{datasource_provider_id}/datasource/callback"
+                    )
+                    system_credentials = self.get_oauth_client(tenant_id, datasource_provider_id)
+                    refreshed_credentials = OAuthHandler().refresh_credentials(
+                        tenant_id=tenant_id,
+                        user_id=current_user.id,
+                        plugin_id=datasource_provider_id.plugin_id,
+                        provider=provider_name,
+                        redirect_uri=redirect_uri,
+                        system_credentials=system_credentials or {},
+                        credentials=real_credentials,
+                    )
+                    # Store the refreshed credentials encrypted, together with their new expiry.
+                    datasource_provider.encrypted_credentials = self.encrypt_datasource_provider_credentials(
+                        tenant_id=tenant_id,
+                        raw_credentials=refreshed_credentials.credentials,
+                        provider=provider,
+                        plugin_id=plugin_id,
+                        datasource_provider=datasource_provider,
+                    )
+                    datasource_provider.expires_at = refreshed_credentials.expires_at
+                    # Re-read from the record so the result reflects exactly what was stored.
+                    real_credentials = self.decrypt_datasource_provider_credentials(
+                        tenant_id=tenant_id,
+                        datasource_provider=datasource_provider,
+                        plugin_id=plugin_id,
+                        provider=provider,
+                    )
+                real_credentials_list.append(real_credentials)
+            # Persist all refreshed records in one transaction.
+            session.commit()
+
+            return real_credentials_list
+
+    def update_datasource_provider_name(
+        self, tenant_id: str, datasource_provider_id: DatasourceProviderID, name: str, credential_id: str
+    ):
+        """
+        Rename a stored datasource credential.
+
+        The credential is looked up by tenant, id, provider name and plugin id. Nothing
+        is written when it already carries ``name``.
+
+        :raises ValueError: when the credential does not exist, or when another record of
+            the same tenant/provider/plugin already uses ``name``
+        """
+        provider_name = datasource_provider_id.provider_name
+        plugin_id = datasource_provider_id.plugin_id
+        with Session(db.engine) as session:
+            credential = (
+                session.query(DatasourceProvider)
+                .filter_by(tenant_id=tenant_id, id=credential_id, provider=provider_name, plugin_id=plugin_id)
+                .first()
+            )
+            if credential is None:
+                raise ValueError("provider not found")
+
+            if credential.name != name:
+                # Reject names already used by this tenant for the same provider/plugin.
+                same_name_count = (
+                    session.query(DatasourceProvider)
+                    .filter_by(tenant_id=tenant_id, name=name, provider=provider_name, plugin_id=plugin_id)
+                    .count()
+                )
+                if same_name_count > 0:
+                    raise ValueError("Authorization name is already exists")
+
+                credential.name = name
+                session.commit()
+        return None
+
+    def set_default_datasource_provider(
+        self, tenant_id: str, datasource_provider_id: DatasourceProviderID, credential_id: str
+    ):
+        """
+        Mark one stored credential as the default for its provider.
+
+        Every record of the same tenant/provider/plugin currently flagged as default is
+        unflagged first; the target record is then flagged and the change committed.
+
+        :return: ``{"result": "success"}``
+        :raises ValueError: when the credential does not exist
+        """
+        with Session(db.engine) as session:
+            lookup = {
+                "tenant_id": tenant_id,
+                "id": credential_id,
+                "provider": datasource_provider_id.provider_name,
+                "plugin_id": datasource_provider_id.plugin_id,
+            }
+            new_default = session.query(DatasourceProvider).filter_by(**lookup).first()
+            if new_default is None:
+                raise ValueError("provider not found")
+
+            # Unflag whichever records are currently the default for this provider.
+            current_defaults = session.query(DatasourceProvider).filter_by(
+                tenant_id=tenant_id,
+                provider=new_default.provider,
+                plugin_id=new_default.plugin_id,
+                is_default=True,
+            )
+            current_defaults.update({"is_default": False})
+
+            # Flag the requested record.
+            new_default.is_default = True
+            session.commit()
+        return {"result": "success"}
+
+    def setup_oauth_custom_client_params(
+        self,
+        tenant_id: str,
+        datasource_provider_id: DatasourceProviderID,
+        client_params: dict | None,
+        enabled: bool | None,
+    ):
+        """
+        Create or update the tenant's custom OAuth client params for a datasource provider.
+
+        A config row is created (empty params, disabled) when none exists yet. When
+        ``client_params`` is given, values equal to ``HIDDEN_VALUE`` are replaced by the
+        previously stored value for that key (or ``UNKNOWN_VALUE`` when there is none)
+        before the params are encrypted and stored. When ``enabled`` is given, the flag
+        is updated. The call is a no-op when both arguments are None.
+
+        :param tenant_id: tenant owning the config
+        :param datasource_provider_id: provider whose OAuth client is configured
+        :param client_params: new client params, or None to leave them untouched
+        :param enabled: new enabled flag, or None to leave it untouched
+        """
+        if client_params is None and enabled is None:
+            return
+        with Session(db.engine) as session:
+            tenant_oauth_client_params = (
+                session.query(DatasourceOauthTenantParamConfig)
+                .filter_by(
+                    tenant_id=tenant_id,
+                    provider=datasource_provider_id.provider_name,
+                    plugin_id=datasource_provider_id.plugin_id,
+                )
+                .first()
+            )
+
+            if tenant_oauth_client_params is None:
+                tenant_oauth_client_params = DatasourceOauthTenantParamConfig(
+                    tenant_id=tenant_id,
+                    provider=datasource_provider_id.provider_name,
+                    plugin_id=datasource_provider_id.plugin_id,
+                    client_params={},
+                    enabled=False,
+                )
+                session.add(tenant_oauth_client_params)
+
+            if client_params is not None:
+                # Named `oauth_encrypter` so it does not shadow the module-level `encrypter` import.
+                oauth_encrypter, _ = self.get_oauth_encrypter(tenant_id, datasource_provider_id)
+                # The row always exists at this point (loaded or just created above).
+                original_params = oauth_encrypter.decrypt(tenant_oauth_client_params.client_params)
+                # Masked values sent back by the client keep their previously stored value.
+                new_params: dict = {
+                    key: value if value != HIDDEN_VALUE else original_params.get(key, UNKNOWN_VALUE)
+                    for key, value in client_params.items()
+                }
+                tenant_oauth_client_params.client_params = oauth_encrypter.encrypt(new_params)
+
+            if enabled is not None:
+                tenant_oauth_client_params.enabled = enabled
+            session.commit()
+
+    def is_system_oauth_params_exist(self, datasource_provider_id: DatasourceProviderID) -> bool:
+        """
+        Check whether system-level OAuth client params exist for the given provider.
+
+        :param datasource_provider_id: identifies the plugin and provider
+        :return: True when a DatasourceOauthParamConfig row matches the provider and plugin
+        """
+        # Enter the Session itself so it is closed on exit. `Session(...).no_autoflush` only
+        # toggles autoflush and would leave the session (and its connection) open.
+        with Session(db.engine, autoflush=False) as session:
+            return (
+                session.query(DatasourceOauthParamConfig)
+                .filter_by(provider=datasource_provider_id.provider_name, plugin_id=datasource_provider_id.plugin_id)
+                .first()
+                is not None
+            )
+
+    def is_tenant_oauth_params_enabled(self, tenant_id: str, datasource_provider_id: DatasourceProviderID) -> bool:
+        """
+        Tell whether the tenant has an enabled custom OAuth client configuration for the provider.
+
+        :param tenant_id: workspace id
+        :param datasource_provider_id: identifies the plugin and provider
+        :return: True when at least one matching row has enabled=True
+        """
+        enabled_configs = db.session.query(DatasourceOauthTenantParamConfig).filter_by(
+            tenant_id=tenant_id,
+            provider=datasource_provider_id.provider_name,
+            plugin_id=datasource_provider_id.plugin_id,
+            enabled=True,
+        )
+        return enabled_configs.count() > 0
+
+    def get_tenant_oauth_client(
+        self, tenant_id: str, datasource_provider_id: DatasourceProviderID, mask: bool = False
+    ) -> dict[str, Any] | None:
+        """
+        Return the tenant's custom OAuth client params for the provider, decrypted.
+
+        :param tenant_id: workspace id
+        :param datasource_provider_id: identifies the plugin and provider
+        :param mask: when true, the decrypted params are passed through the encrypter's masking
+        :return: the (optionally masked) params, or None when the tenant has no configuration row
+        """
+        config = (
+            db.session.query(DatasourceOauthTenantParamConfig)
+            .filter_by(
+                tenant_id=tenant_id,
+                provider=datasource_provider_id.provider_name,
+                plugin_id=datasource_provider_id.plugin_id,
+            )
+            .first()
+        )
+        if not config:
+            return None
+
+        oauth_encrypter, _ = self.get_oauth_encrypter(tenant_id, datasource_provider_id)
+        decrypted_params = oauth_encrypter.decrypt(config.client_params)
+        return oauth_encrypter.mask_tool_credentials(decrypted_params) if mask else decrypted_params
+
+    def get_oauth_encrypter(
+        self, tenant_id: str, datasource_provider_id: DatasourceProviderID
+    ) -> tuple[ProviderConfigEncrypter, ProviderConfigCache]:
+        """
+        Build the encrypter for the provider's OAuth client schema.
+
+        :param tenant_id: workspace id
+        :param datasource_provider_id: identifies the plugin and provider
+        :return: whatever create_provider_encrypter returns for the client schema (encrypter, cache)
+        :raises ValueError: when the provider declares no OAuth schema
+        """
+        provider_controller = self.provider_manager.fetch_datasource_provider(
+            tenant_id=tenant_id, provider_id=str(datasource_provider_id)
+        )
+        oauth_schema = provider_controller.declaration.oauth_schema
+        if not oauth_schema:
+            raise ValueError("Datasource provider oauth schema not found")
+
+        basic_configs = [field.to_basic_provider_config() for field in oauth_schema.client_schema]
+        return create_provider_encrypter(
+            tenant_id=tenant_id,
+            config=basic_configs,
+            cache=NoOpProviderCredentialCache(),
+        )
+
+    def get_oauth_client(self, tenant_id: str, datasource_provider_id: DatasourceProviderID) -> dict[str, Any] | None:
+        """
+        Resolve the OAuth client params to use for a datasource provider.
+
+        The tenant's enabled custom client params take precedence. Otherwise, when the plugin is
+        reported as verified, the system-level client params are used.
+
+        :param tenant_id: workspace id
+        :param datasource_provider_id: identifies the plugin and provider
+        :return: decrypted tenant client params, or the system credentials
+        :raises ValueError: when neither tenant nor usable system client params are configured
+        """
+        provider = datasource_provider_id.provider_name
+        plugin_id = datasource_provider_id.plugin_id
+        # Enter the Session itself so it is closed on exit. `Session(...).no_autoflush` only
+        # toggles autoflush and would leave the session (and its connection) open.
+        with Session(db.engine, autoflush=False) as session:
+            # get tenant oauth client params
+            tenant_oauth_client_params = (
+                session.query(DatasourceOauthTenantParamConfig)
+                .filter_by(
+                    tenant_id=tenant_id,
+                    provider=provider,
+                    plugin_id=plugin_id,
+                    enabled=True,
+                )
+                .first()
+            )
+            if tenant_oauth_client_params:
+                encrypter, _ = self.get_oauth_encrypter(tenant_id, datasource_provider_id)
+                return encrypter.decrypt(tenant_oauth_client_params.client_params)
+
+            provider_controller = self.provider_manager.fetch_datasource_provider(
+                tenant_id=tenant_id, provider_id=str(datasource_provider_id)
+            )
+            is_verified = PluginService.is_plugin_verified(tenant_id, provider_controller.plugin_unique_identifier)
+            if is_verified:
+                # fallback to system oauth client params
+                oauth_client_params = (
+                    session.query(DatasourceOauthParamConfig).filter_by(provider=provider, plugin_id=plugin_id).first()
+                )
+                if oauth_client_params:
+                    return oauth_client_params.system_credentials
+
+            raise ValueError(f"Please configure oauth client params(system/tenant) for {plugin_id}/{provider}")
+
+    @staticmethod
+    def generate_next_datasource_provider_name(
+        session: Session, tenant_id: str, provider_id: DatasourceProviderID, credential_type: CredentialType
+    ) -> str:
+        """
+        Pick a name for a new credential of the given provider.
+
+        The names of all credentials already stored for this tenant/provider/plugin and the
+        credential type's display name are handed to generate_incremental_name.
+        """
+        existing = session.query(DatasourceProvider).filter_by(
+            tenant_id=tenant_id,
+            provider=provider_id.provider_name,
+            plugin_id=provider_id.plugin_id,
+        )
+        taken_names = [row.name for row in existing.all()]
+        base_name = f"{credential_type.get_name()}"
+        return generate_incremental_name(taken_names, base_name)
+
+    def reauthorize_datasource_oauth_provider(
+        self,
+        name: str | None,
+        tenant_id: str,
+        provider_id: DatasourceProviderID,
+        avatar_url: str | None,
+        expire_at: int,
+        credentials: dict,
+        credential_id: str,
+    ) -> None:
+        """
+        Refresh an existing OAuth credential after re-authorization.
+
+        :param name: new display name; falsy keeps the current name. A name already used by another
+            OAuth credential of this provider is replaced via generate_incremental_name.
+        :param tenant_id: workspace id
+        :param provider_id: identifies the plugin and provider
+        :param avatar_url: new avatar url; falsy keeps the current one
+        :param expire_at: new expiry value stored on the credential
+        :param credentials: plain credentials; secret fields are encrypted before storing.
+            The passed dict is not modified.
+        :param credential_id: id of the credential row to update
+        :raises ValueError: when the credential does not exist for this tenant
+        """
+        with Session(db.engine) as session:
+            lock = f"datasource_provider_create_lock:{tenant_id}_{provider_id}_{CredentialType.OAUTH2.value}"
+            with redis_client.lock(lock, timeout=20):
+                target_provider = (
+                    session.query(DatasourceProvider).filter_by(id=credential_id, tenant_id=tenant_id).first()
+                )
+                if target_provider is None:
+                    raise ValueError("provider not found")
+
+                db_provider_name = name or target_provider.name
+                # Only look for clashes when the name actually changes; otherwise the row
+                # would "conflict" with itself and be renamed needlessly.
+                if db_provider_name != target_provider.name:
+                    name_conflict = (
+                        session.query(DatasourceProvider)
+                        .filter_by(
+                            tenant_id=tenant_id,
+                            name=db_provider_name,
+                            provider=provider_id.provider_name,
+                            plugin_id=provider_id.plugin_id,
+                            auth_type=CredentialType.OAUTH2.value,
+                        )
+                        .count()
+                    )
+                    if name_conflict > 0:
+                        db_provider_name = generate_incremental_name(
+                            [
+                                provider.name
+                                for provider in session.query(DatasourceProvider).filter_by(
+                                    tenant_id=tenant_id,
+                                    provider=provider_id.provider_name,
+                                    plugin_id=provider_id.plugin_id,
+                                )
+                            ],
+                            db_provider_name,
+                        )
+
+                provider_credential_secret_variables = self.extract_secret_variables(
+                    tenant_id=tenant_id, provider_id=f"{provider_id}", credential_type=CredentialType.OAUTH2
+                )
+                # Build a new dict instead of encrypting in place, so the caller's dict keeps plain values.
+                encrypted_credentials = {
+                    key: encrypter.encrypt_token(tenant_id, value)
+                    if key in provider_credential_secret_variables
+                    else value
+                    for key, value in credentials.items()
+                }
+
+                # The resolved name was previously computed but never stored.
+                target_provider.name = db_provider_name
+                target_provider.expires_at = expire_at
+                target_provider.encrypted_credentials = encrypted_credentials
+                target_provider.avatar_url = avatar_url or target_provider.avatar_url
+                session.commit()
+
+    def add_datasource_oauth_provider(
+        self,
+        name: str | None,
+        tenant_id: str,
+        provider_id: DatasourceProviderID,
+        avatar_url: str | None,
+        expire_at: int,
+        credentials: dict,
+    ) -> None:
+        """
+        Store a new OAuth credential for a datasource provider.
+
+        :param name: display name; falsy generates one. A name already used by another OAuth
+            credential of this provider is replaced via generate_incremental_name.
+        :param tenant_id: workspace id
+        :param provider_id: identifies the plugin and provider
+        :param avatar_url: avatar url; falsy stores "default"
+        :param expire_at: expiry value stored on the credential
+        :param credentials: plain credentials; secret fields are encrypted before storing.
+            The passed dict is not modified.
+        """
+        credential_type = CredentialType.OAUTH2
+        with Session(db.engine) as session:
+            lock = f"datasource_provider_create_lock:{tenant_id}_{provider_id}_{credential_type.value}"
+            with redis_client.lock(lock, timeout=60):
+                db_provider_name = name
+                if not db_provider_name:
+                    db_provider_name = self.generate_next_datasource_provider_name(
+                        session=session,
+                        tenant_id=tenant_id,
+                        provider_id=provider_id,
+                        credential_type=credential_type,
+                    )
+                elif (
+                    session.query(DatasourceProvider)
+                    .filter_by(
+                        tenant_id=tenant_id,
+                        name=db_provider_name,
+                        provider=provider_id.provider_name,
+                        plugin_id=provider_id.plugin_id,
+                        auth_type=credential_type.value,
+                    )
+                    .count()
+                    > 0
+                ):
+                    # requested name is taken: derive a free one from all names of this provider
+                    db_provider_name = generate_incremental_name(
+                        [
+                            provider.name
+                            for provider in session.query(DatasourceProvider).filter_by(
+                                tenant_id=tenant_id,
+                                provider=provider_id.provider_name,
+                                plugin_id=provider_id.plugin_id,
+                            )
+                        ],
+                        db_provider_name,
+                    )
+
+                provider_credential_secret_variables = self.extract_secret_variables(
+                    tenant_id=tenant_id, provider_id=f"{provider_id}", credential_type=credential_type
+                )
+                # Build a new dict instead of encrypting in place, so the caller's dict keeps plain values.
+                encrypted_credentials = {
+                    key: encrypter.encrypt_token(tenant_id, value)
+                    if key in provider_credential_secret_variables
+                    else value
+                    for key, value in credentials.items()
+                }
+
+                datasource_provider = DatasourceProvider(
+                    tenant_id=tenant_id,
+                    name=db_provider_name,
+                    provider=provider_id.provider_name,
+                    plugin_id=provider_id.plugin_id,
+                    auth_type=credential_type.value,
+                    encrypted_credentials=encrypted_credentials,
+                    avatar_url=avatar_url or "default",
+                    expires_at=expire_at,
+                )
+                session.add(datasource_provider)
+                session.commit()
+
+    def add_datasource_api_key_provider(
+        self,
+        name: str | None,
+        tenant_id: str,
+        provider_id: DatasourceProviderID,
+        credentials: dict,
+    ) -> None:
+        """
+        Validate and store a new API-key credential for a datasource provider.
+
+        :param name: display name; falsy generates one
+        :param tenant_id: workspace id
+        :param provider_id: identifies the plugin and provider
+        :param credentials: plain credentials; secret fields are encrypted before storing.
+            The passed dict is not modified.
+        :raises ValueError: when the name is already used for this provider, or the provider
+            manager rejects the credentials
+        """
+        provider_name = provider_id.provider_name
+        plugin_id = provider_id.plugin_id
+        with Session(db.engine) as session:
+            # `.value` keeps the lock key format in line with the OAuth variants.
+            lock = f"datasource_provider_create_lock:{tenant_id}_{provider_id}_{CredentialType.API_KEY.value}"
+            with redis_client.lock(lock, timeout=20):
+                db_provider_name = name or self.generate_next_datasource_provider_name(
+                    session=session,
+                    tenant_id=tenant_id,
+                    provider_id=provider_id,
+                    credential_type=CredentialType.API_KEY,
+                )
+
+                # check name is exist
+                if (
+                    session.query(DatasourceProvider)
+                    .filter_by(tenant_id=tenant_id, plugin_id=plugin_id, provider=provider_name, name=db_provider_name)
+                    .count()
+                    > 0
+                ):
+                    raise ValueError("Authorization name is already exists")
+
+                try:
+                    self.provider_manager.validate_provider_credentials(
+                        tenant_id=tenant_id,
+                        user_id=current_user.id,
+                        provider=provider_name,
+                        plugin_id=plugin_id,
+                        credentials=credentials,
+                    )
+                except Exception as e:
+                    # keep the original error as the explicit cause
+                    raise ValueError(f"Failed to validate credentials: {str(e)}") from e
+
+                provider_credential_secret_variables = self.extract_secret_variables(
+                    tenant_id=tenant_id, provider_id=f"{provider_id}", credential_type=CredentialType.API_KEY
+                )
+                # Build a new dict instead of encrypting in place, so the caller's dict keeps plain values.
+                encrypted_credentials = {
+                    key: encrypter.encrypt_token(tenant_id, value)
+                    if key in provider_credential_secret_variables
+                    else value
+                    for key, value in credentials.items()
+                }
+                datasource_provider = DatasourceProvider(
+                    tenant_id=tenant_id,
+                    name=db_provider_name,
+                    provider=provider_name,
+                    plugin_id=plugin_id,
+                    auth_type=CredentialType.API_KEY.value,
+                    encrypted_credentials=encrypted_credentials,
+                )
+                session.add(datasource_provider)
+                session.commit()
+
+    def extract_secret_variables(self, tenant_id: str, provider_id: str, credential_type: CredentialType) -> list[str]:
+        """
+        Collect the names of the secret-input fields in the provider's credential form.
+
+        :param tenant_id: workspace id
+        :param provider_id: provider id string handed to the provider manager
+        :param credential_type: API_KEY reads the credentials schema, OAUTH2 the OAuth credentials schema
+        :return: names of the fields whose form type is SECRET_INPUT
+        :raises ValueError: for OAUTH2 without an OAuth schema, or for any other credential type
+        """
+        provider_controller = self.provider_manager.fetch_datasource_provider(
+            tenant_id=tenant_id, provider_id=provider_id
+        )
+        if credential_type == CredentialType.API_KEY:
+            form_schemas = list(provider_controller.declaration.credentials_schema)
+        elif credential_type == CredentialType.OAUTH2:
+            oauth_schema = provider_controller.declaration.oauth_schema
+            if not oauth_schema:
+                raise ValueError("Datasource provider oauth schema not found")
+            form_schemas = list(oauth_schema.credentials_schema)
+        else:
+            raise ValueError(f"Invalid credential type: {credential_type}")
+
+        secret_type = FormType.SECRET_INPUT.value
+        return [schema.name for schema in form_schemas if schema.type.value == secret_type]
+
+    def list_datasource_credentials(self, tenant_id: str, provider: str, plugin_id: str) -> list[dict]:
+        """
+        List the tenant's credentials for one datasource provider with secret fields obfuscated.
+
+        :param tenant_id: workspace id
+        :param provider: provider name
+        :param plugin_id: plugin id
+        :return: one dict per credential (credential, type, name, avatar_url, id, is_default);
+            empty when the tenant has no credential for this provider
+        """
+        # Get all provider configurations of the current workspace
+        datasource_providers: list[DatasourceProvider] = (
+            db.session.query(DatasourceProvider)
+            .where(
+                DatasourceProvider.tenant_id == tenant_id,
+                DatasourceProvider.provider == provider,
+                DatasourceProvider.plugin_id == plugin_id,
+            )
+            .all()
+        )
+        if not datasource_providers:
+            return []
+
+        # The default is the row flagged is_default, falling back to the oldest one.
+        default_provider = (
+            db.session.query(DatasourceProvider.id)
+            .filter_by(tenant_id=tenant_id, provider=provider, plugin_id=plugin_id)
+            .order_by(DatasourceProvider.is_default.desc(), DatasourceProvider.created_at.asc())
+            .first()
+        )
+        default_provider_id = default_provider.id if default_provider else None
+
+        # The secret field names depend only on the credential type, so resolve them once per
+        # type instead of re-fetching the provider schema for every row.
+        secret_variables_by_type: dict[CredentialType, list[str]] = {}
+        copy_credentials_list = []
+        for datasource_provider in datasource_providers:
+            credential_type = CredentialType.of(datasource_provider.auth_type)
+            if credential_type not in secret_variables_by_type:
+                secret_variables_by_type[credential_type] = self.extract_secret_variables(
+                    tenant_id=tenant_id,
+                    provider_id=f"{plugin_id}/{provider}",
+                    credential_type=credential_type,
+                )
+            credential_secret_variables = secret_variables_by_type[credential_type]
+
+            # Obfuscate provider credentials
+            copy_credentials = {
+                key: encrypter.obfuscated_token(value) if key in credential_secret_variables else value
+                for key, value in datasource_provider.encrypted_credentials.items()
+            }
+            copy_credentials_list.append(
+                {
+                    "credential": copy_credentials,
+                    "type": datasource_provider.auth_type,
+                    "name": datasource_provider.name,
+                    "avatar_url": datasource_provider.avatar_url,
+                    "id": datasource_provider.id,
+                    # always a real bool, never None or the id itself
+                    "is_default": bool(default_provider_id and datasource_provider.id == default_provider_id),
+                }
+            )
+
+        return copy_credentials_list
+
+    def get_all_datasource_credentials(self, tenant_id: str) -> list[dict]:
+        """
+        Describe every installed datasource provider of the tenant together with its credentials.
+
+        :param tenant_id: workspace id
+        :return: one entry per installed datasource provider
+        """
+        # get all plugin providers
+        manager = PluginDatasourceManager()
+        datasources = manager.fetch_installed_datasource_providers(tenant_id)
+        return [self._build_datasource_credentials_entry(tenant_id, datasource) for datasource in datasources]
+
+    def get_hard_code_datasource_credentials(self, tenant_id: str) -> list[dict]:
+        """
+        Same as get_all_datasource_credentials, restricted to a fixed list of plugin ids.
+
+        :param tenant_id: workspace id
+        :return: one entry per installed datasource provider whose plugin id is in the fixed list
+        """
+        hard_coded_plugin_ids = [
+            "langgenius/firecrawl_datasource",
+            "langgenius/notion_datasource",
+            "langgenius/jina_datasource",
+        ]
+        # get all plugin providers
+        manager = PluginDatasourceManager()
+        datasources = manager.fetch_installed_datasource_providers(tenant_id)
+        return [
+            self._build_datasource_credentials_entry(tenant_id, datasource)
+            for datasource in datasources
+            if datasource.plugin_id in hard_coded_plugin_ids
+        ]
+
+    def _build_datasource_credentials_entry(self, tenant_id: str, datasource) -> dict:
+        """
+        Build the response entry (identity, schemas, credentials, OAuth details) for one provider.
+        """
+        declaration = datasource.declaration
+        datasource_provider_id = DatasourceProviderID(f"{datasource.plugin_id}/{datasource.provider}")
+        credentials = self.list_datasource_credentials(
+            tenant_id=tenant_id, provider=datasource.provider, plugin_id=datasource.plugin_id
+        )
+        redirect_uri = (
+            f"{dify_config.CONSOLE_API_URL}/console/api/oauth/plugin/{datasource_provider_id}/datasource/callback"
+        )
+        entry = {
+            "provider": datasource.provider,
+            "plugin_id": datasource.plugin_id,
+            "plugin_unique_identifier": datasource.plugin_unique_identifier,
+            "icon": declaration.identity.icon,
+            "name": declaration.identity.name.split("/")[-1],
+            "label": declaration.identity.label.model_dump(),
+            "description": declaration.identity.description.model_dump(),
+            "author": declaration.identity.author,
+            "credentials_list": credentials,
+            "credential_schema": [credential.model_dump() for credential in declaration.credentials_schema],
+            "oauth_schema": None,
+        }
+        # OAuth details are only resolved for providers that declare an OAuth schema.
+        if declaration.oauth_schema:
+            entry["oauth_schema"] = {
+                "client_schema": [
+                    client_schema.model_dump() for client_schema in declaration.oauth_schema.client_schema
+                ],
+                "credentials_schema": [
+                    credential_schema.model_dump()
+                    for credential_schema in declaration.oauth_schema.credentials_schema
+                ],
+                "oauth_custom_client_params": self.get_tenant_oauth_client(
+                    tenant_id, datasource_provider_id, mask=True
+                ),
+                "is_oauth_custom_client_enabled": self.is_tenant_oauth_params_enabled(
+                    tenant_id, datasource_provider_id
+                ),
+                "is_system_oauth_params_exists": self.is_system_oauth_params_exist(datasource_provider_id),
+                "redirect_uri": redirect_uri,
+            }
+        return entry
+
+    def get_real_datasource_credentials(self, tenant_id: str, provider: str, plugin_id: str) -> list[dict]:
+        """
+        List the tenant's credentials for one datasource provider with secret fields decrypted.
+
+        :param tenant_id: workspace id
+        :param provider: provider name
+        :param plugin_id: plugin id
+        :return: one dict per credential (credentials, type); empty when none is stored
+        """
+        # Get all provider configurations of the current workspace
+        datasource_providers: list[DatasourceProvider] = (
+            db.session.query(DatasourceProvider)
+            .where(
+                DatasourceProvider.tenant_id == tenant_id,
+                DatasourceProvider.provider == provider,
+                DatasourceProvider.plugin_id == plugin_id,
+            )
+            .all()
+        )
+        if not datasource_providers:
+            return []
+
+        # The secret field names depend only on the credential type, so resolve them once per
+        # type instead of re-fetching the provider schema for every row.
+        secret_variables_by_type: dict[CredentialType, list[str]] = {}
+        copy_credentials_list = []
+        for datasource_provider in datasource_providers:
+            credential_type = CredentialType.of(datasource_provider.auth_type)
+            if credential_type not in secret_variables_by_type:
+                secret_variables_by_type[credential_type] = self.extract_secret_variables(
+                    tenant_id=tenant_id,
+                    provider_id=f"{plugin_id}/{provider}",
+                    credential_type=credential_type,
+                )
+            credential_secret_variables = secret_variables_by_type[credential_type]
+
+            # Decrypt the secret fields; the other fields are copied as stored
+            copy_credentials = {
+                key: encrypter.decrypt_token(tenant_id, value) if key in credential_secret_variables else value
+                for key, value in datasource_provider.encrypted_credentials.items()
+            }
+            copy_credentials_list.append(
+                {
+                    "credentials": copy_credentials,
+                    "type": datasource_provider.auth_type,
+                }
+            )
+
+        return copy_credentials_list
+
+    def update_datasource_credentials(
+        self, tenant_id: str, auth_id: str, provider: str, plugin_id: str, credentials: dict | None, name: str | None
+    ) -> None:
+        """
+        Rename a stored datasource credential and/or replace its credential values.
+
+        :param tenant_id: workspace id
+        :param auth_id: id of the credential row
+        :param provider: provider name
+        :param plugin_id: plugin id
+        :param credentials: new values; a value equal to HIDDEN_VALUE keeps the stored value.
+            Falsy leaves the stored credentials untouched.
+        :param name: new display name; falsy or unchanged leaves the name untouched
+        :raises ValueError: when the credential is not found, the name is already used for this
+            provider, or the provider manager rejects the new credentials
+        """
+        with Session(db.engine) as session:
+            datasource_provider = (
+                session.query(DatasourceProvider)
+                .filter_by(tenant_id=tenant_id, id=auth_id, provider=provider, plugin_id=plugin_id)
+                .first()
+            )
+            if not datasource_provider:
+                raise ValueError("Datasource provider not found")
+            # update name
+            if name and name != datasource_provider.name:
+                if (
+                    session.query(DatasourceProvider)
+                    .filter_by(tenant_id=tenant_id, name=name, provider=provider, plugin_id=plugin_id)
+                    .count()
+                    > 0
+                ):
+                    raise ValueError("Authorization name is already exists")
+                datasource_provider.name = name
+
+            # update credentials
+            if credentials:
+                secret_variables = self.extract_secret_variables(
+                    tenant_id=tenant_id,
+                    provider_id=f"{plugin_id}/{provider}",
+                    credential_type=CredentialType.of(datasource_provider.auth_type),
+                )
+                # decrypt the stored secrets so masked inputs can be resolved to their real values
+                original_credentials = {
+                    key: value if key not in secret_variables else encrypter.decrypt_token(tenant_id, value)
+                    for key, value in datasource_provider.encrypted_credentials.items()
+                }
+                new_credentials = {
+                    key: value if value != HIDDEN_VALUE else original_credentials.get(key, UNKNOWN_VALUE)
+                    for key, value in credentials.items()
+                }
+                try:
+                    self.provider_manager.validate_provider_credentials(
+                        tenant_id=tenant_id,
+                        user_id=current_user.id,
+                        provider=provider,
+                        plugin_id=plugin_id,
+                        credentials=new_credentials,
+                    )
+                except Exception as e:
+                    # keep the original error as the explicit cause
+                    raise ValueError(f"Failed to validate credentials: {str(e)}") from e
+
+                datasource_provider.encrypted_credentials = {
+                    key: encrypter.encrypt_token(tenant_id, value) if key in secret_variables else value
+                    for key, value in new_credentials.items()
+                }
+            session.commit()
+
+    def remove_datasource_credentials(self, tenant_id: str, auth_id: str, provider: str, plugin_id: str) -> None:
+        """
+        Delete one stored datasource credential; does nothing when no row matches.
+
+        :param tenant_id: workspace id
+        :param auth_id: id of the credential row
+        :param provider: provider name
+        :param plugin_id: plugin id
+        :return:
+        """
+        matching = db.session.query(DatasourceProvider).filter_by(
+            tenant_id=tenant_id, id=auth_id, provider=provider, plugin_id=plugin_id
+        )
+        credential_row = matching.first()
+        if not credential_row:
+            return
+        db.session.delete(credential_row)
+        db.session.commit()
--- a/api/services/enterprise/plugin_manager_service.py
+++ b/api/services/enterprise/plugin_manager_service.py
@ -49,7 +49,7 @@ class PluginManagerService:
        if not ret.get("result", False):
            raise CredentialPolicyViolationError("Credentials not available: Please use ENTERPRISE global credentials")

-        logger.debug(
+        logging.debug(
            "Credential policy compliance checked for %s with credential %s, result: %s",
            body.provider,
            body.dify_credential_id,
--- a/api/services/entities/knowledge_entities/knowledge_entities.py
+++ b/api/services/entities/knowledge_entities/knowledge_entities.py
@ -23,6 +23,7 @@ class NotionPage(BaseModel):


 class NotionInfo(BaseModel):
+    credential_id: str
    workspace_id: str
    pages: list[NotionPage]

--- a/api/services/entities/knowledge_entities/rag_pipeline_entities.py
+++ b/api/services/entities/knowledge_entities/rag_pipeline_entities.py
@ -0,0 +1,130 @@
+from typing import Literal
+
+from pydantic import BaseModel, field_validator
+
+
+class IconInfo(BaseModel):
+    # Icon descriptor used by the template and dataset-creation entities in this module.
+    # Only `icon` is required; every other field defaults to None.
+    icon: str
+    icon_background: str | None = None
+    icon_type: str | None = None
+    icon_url: str | None = None
+
+
+class PipelineTemplateInfoEntity(BaseModel):
+    # Name, description and icon of a pipeline template; all three fields are required.
+    name: str
+    description: str
+    icon_info: IconInfo
+
+
+class RagPipelineDatasetCreateEntity(BaseModel):
+    # Payload describing a dataset to create for a rag pipeline.
+    name: str
+    description: str
+    icon_info: IconInfo
+    permission: str
+    # Optional; defaults to None when omitted.
+    partial_member_list: list[str] | None = None
+    # Optional; defaults to None when omitted.
+    yaml_content: str | None = None
+
+
+class RerankingModelConfig(BaseModel):
+    """
+    Reranking Model Config.
+    """
+
+    # Both fields accept None and default to an empty string.
+    reranking_provider_name: str | None = ""
+    reranking_model_name: str | None = ""
+
+
+class VectorSetting(BaseModel):
+    """
+    Vector Setting.
+    """
+
+    # All three fields are required (no defaults are declared).
+    vector_weight: float
+    embedding_provider_name: str
+    embedding_model_name: str
+
+
+class KeywordSetting(BaseModel):
+    """
+    Keyword Setting.
+    """
+
+    # Required; no default is declared.
+    keyword_weight: float
+
+
+class WeightedScoreConfig(BaseModel):
+    """
+    Weighted score Config.
+    """
+
+    # Both fields accept None, but neither declares a default value.
+    vector_setting: VectorSetting | None
+    keyword_setting: KeywordSetting | None
+
+
+class EmbeddingSetting(BaseModel):
+    """
+    Embedding Setting.
+    """
+
+    # Both fields are required (no defaults are declared).
+    embedding_provider_name: str
+    embedding_model_name: str
+
+
+class EconomySetting(BaseModel):
+    """
+    Economy Setting.
+    """
+
+    # Required; no default is declared.
+    keyword_number: int
+
+
+class RetrievalSetting(BaseModel):
+    """
+    Retrieval Setting.
+    """
+
+    # `search_method` is restricted to the four literal values listed here; it and `top_k` are required.
+    search_method: Literal["semantic_search", "fulltext_search", "keyword_search", "hybrid_search"]
+    top_k: int
+    # Threshold defaults to 0.5 but is flagged as disabled by default.
+    score_threshold: float | None = 0.5
+    score_threshold_enabled: bool = False
+    # Reranking defaults: mode "reranking_model", enabled, with no model or weights configured.
+    reranking_mode: str | None = "reranking_model"
+    reranking_enable: bool | None = True
+    reranking_model: RerankingModelConfig | None = None
+    weights: WeightedScoreConfig | None = None
+
+
+class IndexMethod(BaseModel):
+    """
+    Knowledge Index Setting.
+    """
+
+    # `indexing_technique` must be one of the two literal values below.
+    indexing_technique: Literal["high_quality", "economy"]
+    # Both settings are declared as required, whichever technique is selected.
+    embedding_setting: EmbeddingSetting
+    economy_setting: EconomySetting
+
+
+class KnowledgeConfiguration(BaseModel):
+    """
+    Knowledge Base Configuration.
+    """
+
+    chunk_structure: str
+    indexing_technique: Literal["high_quality", "economy"]
+    embedding_model_provider: str = ""
+    embedding_model: str = ""
+    keyword_number: int | None = 10
+    retrieval_model: RetrievalSetting
+
+    @field_validator("embedding_model_provider", mode="before")
+    @classmethod
+    def validate_embedding_model_provider(cls, v):
+        # Runs before type validation: an explicit None becomes the empty-string default.
+        return "" if v is None else v
+
+    @field_validator("embedding_model", mode="before")
+    @classmethod
+    def validate_embedding_model(cls, v):
+        # Runs before type validation: an explicit None becomes the empty-string default.
+        return "" if v is None else v
--- a/api/services/feature_service.py
+++ b/api/services/feature_service.py
@ -88,6 +88,10 @@ class WebAppAuthModel(BaseModel):
    allow_email_password_login: bool = False


+class KnowledgePipeline(BaseModel):
+    # Knowledge pipeline feature flags; `publish_enabled` is off unless explicitly set.
+    publish_enabled: bool = False
+
+
 class PluginInstallationScope(StrEnum):
    NONE = "none"
    OFFICIAL_ONLY = "official_only"
@ -126,6 +130,7 @@ class FeatureModel(BaseModel):
    is_allow_transfer_workspace: bool = True
    # pydantic configs
    model_config = ConfigDict(protected_namespaces=())
+    knowledge_pipeline: KnowledgePipeline = KnowledgePipeline()


 class KnowledgeRateLimitModel(BaseModel):
@ -271,6 +276,9 @@ class FeatureService:
        if "knowledge_rate_limit" in billing_info:
            features.knowledge_rate_limit = billing_info["knowledge_rate_limit"]["limit"]

+        if "knowledge_pipeline_publish_enabled" in billing_info:
+            features.knowledge_pipeline.publish_enabled = billing_info["knowledge_pipeline_publish_enabled"]
+
    @classmethod
    def _fulfill_params_from_enterprise(cls, features: SystemFeatureModel):
        enterprise_info = EnterpriseService.get_info()
--- a/api/services/file_service.py
+++ b/api/services/file_service.py
@ -3,6 +3,8 @@ import os
 import uuid
 from typing import Literal, Union

+from sqlalchemy import Engine
+from sqlalchemy.orm import sessionmaker
 from werkzeug.exceptions import NotFound

 from configs import dify_config
@ -14,11 +16,9 @@ from constants import (
 )
 from core.file import helpers as file_helpers
 from core.rag.extractor.extract_processor import ExtractProcessor
-from extensions.ext_database import db
 from extensions.ext_storage import storage
 from libs.datetime_utils import naive_utc_now
 from libs.helper import extract_tenant_id
-from libs.login import current_user
 from models.account import Account
 from models.enums import CreatorUserRole
 from models.model import EndUser, UploadFile
@ -29,8 +29,18 @@ PREVIEW_WORDS_LIMIT = 3000


 class FileService:
-    @staticmethod
+    _session_maker: sessionmaker
+
+    def __init__(self, session_factory: sessionmaker | Engine | None = None):
+        """
+        Store the session factory used by every database access of this service.
+
+        :param session_factory: a ``sessionmaker`` used as-is, or an ``Engine`` that is
+            wrapped in a new ``sessionmaker``; any other value (including the default
+            ``None``) is rejected
+        :raises AssertionError: when ``session_factory`` is neither of the two types
+        """
+        if isinstance(session_factory, sessionmaker):
+            maker = session_factory
+        elif isinstance(session_factory, Engine):
+            maker = sessionmaker(bind=session_factory)
+        else:
+            raise AssertionError("must be a sessionmaker or an Engine.")
+        self._session_maker = maker
+
    def upload_file(
+        self,
        *,
        filename: str,
        content: bytes,
@ -85,14 +95,14 @@ class FileService:
            hash=hashlib.sha3_256(content).hexdigest(),
            source_url=source_url,
        )
-
-        db.session.add(upload_file)
-        db.session.commit()
-
+        # The `UploadFile` ID is generated within its constructor, so flushing to retrieve the ID is unnecessary.
+        # We can directly generate the `source_url` here before committing.
        if not upload_file.source_url:
            upload_file.source_url = file_helpers.get_signed_file_url(upload_file_id=upload_file.id)
-            db.session.add(upload_file)
-            db.session.commit()
+
+        with self._session_maker(expire_on_commit=False) as session:
+            session.add(upload_file)
+            session.commit()

        return upload_file

@ -109,45 +119,42 @@ class FileService:

        return file_size <= file_size_limit

-    @staticmethod
-    def upload_text(text: str, text_name: str) -> UploadFile:
-        assert isinstance(current_user, Account)
-        assert current_user.current_tenant_id is not None
-
+    def upload_text(self, text: str, text_name: str, user_id: str, tenant_id: str) -> UploadFile:
        if len(text_name) > 200:
            text_name = text_name[:200]
        # user uuid as file name
        file_uuid = str(uuid.uuid4())
-        file_key = "upload_files/" + current_user.current_tenant_id + "/" + file_uuid + ".txt"
+        file_key = "upload_files/" + tenant_id + "/" + file_uuid + ".txt"

        # save file to storage
        storage.save(file_key, text.encode("utf-8"))

        # save file to db
        upload_file = UploadFile(
-            tenant_id=current_user.current_tenant_id,
+            tenant_id=tenant_id,
            storage_type=dify_config.STORAGE_TYPE,
            key=file_key,
            name=text_name,
            size=len(text),
            extension="txt",
            mime_type="text/plain",
-            created_by=current_user.id,
+            created_by=user_id,
            created_by_role=CreatorUserRole.ACCOUNT,
            created_at=naive_utc_now(),
            used=True,
-            used_by=current_user.id,
+            used_by=user_id,
            used_at=naive_utc_now(),
        )

-        db.session.add(upload_file)
-        db.session.commit()
+        with self._session_maker(expire_on_commit=False) as session:
+            session.add(upload_file)
+            session.commit()

        return upload_file

-    @staticmethod
-    def get_file_preview(file_id: str):
-        upload_file = db.session.query(UploadFile).where(UploadFile.id == file_id).first()
+    def get_file_preview(self, file_id: str):
+        with self._session_maker(expire_on_commit=False) as session:
+            upload_file = session.query(UploadFile).where(UploadFile.id == file_id).first()

        if not upload_file:
            raise NotFound("File not found")
@ -162,15 +169,14 @@ class FileService:

        return text

-    @staticmethod
-    def get_image_preview(file_id: str, timestamp: str, nonce: str, sign: str):
+    def get_image_preview(self, file_id: str, timestamp: str, nonce: str, sign: str):
        result = file_helpers.verify_image_signature(
            upload_file_id=file_id, timestamp=timestamp, nonce=nonce, sign=sign
        )
        if not result:
            raise NotFound("File not found or signature is invalid")
-
-        upload_file = db.session.query(UploadFile).where(UploadFile.id == file_id).first()
+        with self._session_maker(expire_on_commit=False) as session:
+            upload_file = session.query(UploadFile).where(UploadFile.id == file_id).first()

        if not upload_file:
            raise NotFound("File not found or signature is invalid")
@ -184,13 +190,13 @@ class FileService:

        return generator, upload_file.mime_type

-    @staticmethod
-    def get_file_generator_by_file_id(file_id: str, timestamp: str, nonce: str, sign: str):
+    def get_file_generator_by_file_id(self, file_id: str, timestamp: str, nonce: str, sign: str):
        result = file_helpers.verify_file_signature(upload_file_id=file_id, timestamp=timestamp, nonce=nonce, sign=sign)
        if not result:
            raise NotFound("File not found or signature is invalid")

-        upload_file = db.session.query(UploadFile).where(UploadFile.id == file_id).first()
+        with self._session_maker(expire_on_commit=False) as session:
+            upload_file = session.query(UploadFile).where(UploadFile.id == file_id).first()

        if not upload_file:
            raise NotFound("File not found or signature is invalid")
@ -199,9 +205,9 @@ class FileService:

        return generator, upload_file

-    @staticmethod
-    def get_public_image_preview(file_id: str):
-        upload_file = db.session.query(UploadFile).where(UploadFile.id == file_id).first()
+    def get_public_image_preview(self, file_id: str):
+        with self._session_maker(expire_on_commit=False) as session:
+            upload_file = session.query(UploadFile).where(UploadFile.id == file_id).first()

        if not upload_file:
            raise NotFound("File not found or signature is invalid")
@ -214,3 +220,23 @@ class FileService:
        generator = storage.load(upload_file.key)

        return generator, upload_file.mime_type
+
+    def get_file_content(self, file_id: str) -> str:
+        """
+        Load the stored content of an uploaded file and return it decoded as UTF-8.
+
+        :param file_id: id of the ``UploadFile`` row
+        :raises NotFound: when no ``UploadFile`` row has this id
+        """
+        with self._session_maker(expire_on_commit=False) as session:
+            record: UploadFile | None = session.query(UploadFile).where(UploadFile.id == file_id).first()
+
+        if not record:
+            raise NotFound("File not found")
+
+        return storage.load(record.key).decode("utf-8")
+
+    def delete_file(self, file_id: str):
+        """
+        Delete an uploaded file from storage and remove its ``UploadFile`` row.
+
+        Does nothing when no ``UploadFile`` row has the given id.
+
+        :param file_id: id of the ``UploadFile`` row
+        """
+        # Lookup, delete and commit must all happen while the session is open: leaving the
+        # `with` block closes the session and detaches the loaded row.
+        with self._session_maker(expire_on_commit=False) as session:
+            upload_file: UploadFile | None = session.query(UploadFile).where(UploadFile.id == file_id).first()
+            if not upload_file:
+                return
+
+            storage.delete(upload_file.key)
+            session.delete(upload_file)
+            session.commit()
--- a/api/services/message_service.py
+++ b/api/services/message_service.py
@ -241,6 +241,9 @@ class MessageService:

            app_config = AdvancedChatAppConfigManager.get_app_config(app_model=app_model, workflow=workflow)

+            if not app_config.additional_features:
+                raise ValueError("Additional features not found")
+
            if not app_config.additional_features.suggested_questions_after_answer:
                raise SuggestedQuestionsAfterAnswerDisabledError()

--- a/api/services/plugin/data_migration.py
+++ b/api/services/plugin/data_migration.py
@ -4,8 +4,8 @@ import logging
 import click
 import sqlalchemy as sa

-from core.plugin.entities.plugin import GenericProviderID, ModelProviderID, ToolProviderID
-from models.engine import db
+from extensions.ext_database import db
+from models.provider_ids import GenericProviderID, ModelProviderID, ToolProviderID

 logger = logging.getLogger(__name__)

--- a/api/services/plugin/dependencies_analysis.py
+++ b/api/services/plugin/dependencies_analysis.py
@ -1,7 +1,13 @@
+import re
+
 from configs import dify_config
 from core.helper import marketplace
-from core.plugin.entities.plugin import ModelProviderID, PluginDependency, PluginInstallationSource, ToolProviderID
+from core.plugin.entities.plugin import PluginDependency, PluginInstallationSource
 from core.plugin.impl.plugin import PluginInstaller
+from models.provider_ids import ModelProviderID, ToolProviderID
+
+# Compile regex pattern for version extraction at module level for better performance
+_VERSION_REGEX = re.compile(r":(?P<version>[0-9]+(?:\.[0-9]+){2}(?:[+-][0-9A-Za-z.-]+)?)(?:@|$)")


 class DependenciesAnalysisService:
@ -48,6 +54,13 @@ class DependenciesAnalysisService:
        for dependency in dependencies:
            unique_identifier = dependency.value.plugin_unique_identifier
            if unique_identifier in missing_plugin_unique_identifiers:
+                # Extract version for Marketplace dependencies
+                if dependency.type == PluginDependency.Type.Marketplace:
+                    version_match = _VERSION_REGEX.search(unique_identifier)
+                    if version_match:
+                        dependency.value.version = version_match.group("version")
+
+                # Create and append the dependency (same for all types)
                leaked_dependencies.append(
                    PluginDependency(
                        type=dependency.type,
--- a/api/services/plugin/oauth_service.py
+++ b/api/services/plugin/oauth_service.py
@ -11,7 +11,13 @@ class OAuthProxyService(BasePluginClient):
    __KEY_PREFIX__ = "oauth_proxy_context:"

    @staticmethod
-    def create_proxy_context(user_id: str, tenant_id: str, plugin_id: str, provider: str):
+    def create_proxy_context(
+        user_id: str,
+        tenant_id: str,
+        plugin_id: str,
+        provider: str,
+        credential_id: str | None = None,
+    ):
        """
        Create a proxy context for an OAuth 2.0 authorization request.

@ -31,6 +37,8 @@ class OAuthProxyService(BasePluginClient):
            "tenant_id": tenant_id,
            "provider": provider,
        }
+        if credential_id:
+            data["credential_id"] = credential_id
        redis_client.setex(
            f"{OAuthProxyService.__KEY_PREFIX__}{context_id}",
            OAuthProxyService.__MAX_AGE__,
--- a/api/services/plugin/plugin_migration.py
+++ b/api/services/plugin/plugin_migration.py
@ -16,15 +16,17 @@ from sqlalchemy.orm import Session

 from core.agent.entities import AgentToolEntity
 from core.helper import marketplace
-from core.plugin.entities.plugin import ModelProviderID, PluginInstallationSource, ToolProviderID
+from core.plugin.entities.plugin import PluginInstallationSource
 from core.plugin.entities.plugin_daemon import PluginInstallTaskStatus
 from core.plugin.impl.plugin import PluginInstaller
 from core.tools.entities.tool_entities import ToolProviderType
+from extensions.ext_database import db
 from models.account import Tenant
-from models.engine import db
 from models.model import App, AppMode, AppModelConfig
+from models.provider_ids import ModelProviderID, ToolProviderID
 from models.tools import BuiltinToolProvider
 from models.workflow import Workflow
+from services.plugin.plugin_service import PluginService

 logger = logging.getLogger(__name__)

@ -421,6 +423,94 @@ class PluginMigration:
            )
        )

+    @classmethod
+    def install_rag_pipeline_plugins(cls, extracted_plugins: str, output_file: str, workers: int = 100) -> None:
+        """
+        Install rag pipeline plugins for every tenant and write a JSON report.
+
+        The plugin set returned by ``extract_unique_plugins`` is first installed for a
+        throw-away tenant id through ``handle_plugin_instance_install``, then installed
+        tenant by tenant on a thread pool (at most 64 identifiers per request). The
+        throw-away tenant's installations are removed again at the end.
+
+        :param extracted_plugins: argument forwarded to ``extract_unique_plugins``
+        :param output_file: path the JSON summary is written to
+        :param workers: maximum number of worker threads
+        """
+        manager = PluginInstaller()
+
+        plugins = cls.extract_unique_plugins(extracted_plugins)
+        plugin_identifiers: dict[str, str] = plugins["plugins"]
+        plugin_install_failed = []
+
+        # use a fake tenant id to install all the plugins
+        fake_tenant_id = uuid4().hex
+        logger.info("Installing %s plugin instances for fake tenant %s", len(plugin_identifiers), fake_tenant_id)
+
+        response = cls.handle_plugin_instance_install(fake_tenant_id, plugin_identifiers)
+        if response.get("failed"):
+            plugin_install_failed.extend(response.get("failed", []))
+
+        def install(tenant_id: str, plugin_ids: dict[str, str]) -> bool:
+            """Install the not-yet-installed plugins for one tenant; return True on success."""
+            logger.info("Installing %s plugins for tenant %s", len(plugin_ids), tenant_id)
+            try:
+                # fetch plugin already installed
+                installed_plugin_ids = {plugin.plugin_id for plugin in manager.list_plugins(tenant_id)}
+                all_plugin_ids = list(plugin_ids)
+                # at most 64 plugins one batch
+                for start in range(0, len(all_plugin_ids), 64):
+                    batch_plugin_identifiers = [
+                        plugin_ids[plugin_id]
+                        for plugin_id in all_plugin_ids[start : start + 64]
+                        if plugin_id not in installed_plugin_ids
+                    ]
+                    # nothing left to install in this batch, so skip the request entirely
+                    if batch_plugin_identifiers:
+                        PluginService.install_from_marketplace_pkg(tenant_id, batch_plugin_identifiers)
+            except Exception:
+                logger.exception("Failed to install plugins for tenant %s", tenant_id)
+                return False
+            return True
+
+        # Outcomes are collected from the futures. Counters handed to the worker as int
+        # arguments would only be incremented on the worker's local copies.
+        futures = []
+        page = 1
+        # The context manager waits for all submitted jobs and always shuts the pool down.
+        with ThreadPoolExecutor(max_workers=workers) as thread_pool:
+            while True:
+                # paginate; error_out=False so the page after the last one is empty instead of a 404 abort
+                tenants = db.paginate(
+                    db.select(Tenant).order_by(Tenant.created_at.desc()),
+                    page=page,
+                    per_page=100,
+                    error_out=False,
+                )
+                if not tenants.items:
+                    break
+
+                for tenant in tenants.items:
+                    futures.append(thread_pool.submit(install, tenant.id, plugin_identifiers))
+
+                page += 1
+
+        total_success_tenant = sum(1 for future in futures if future.result())
+        total_failed_tenant = len(futures) - total_success_tenant
+
+        # uninstall all the plugins for fake tenant
+        try:
+            installation = manager.list_plugins(fake_tenant_id)
+            while installation:
+                for plugin in installation:
+                    manager.uninstall(fake_tenant_id, plugin.installation_id)
+
+                installation = manager.list_plugins(fake_tenant_id)
+        except Exception:
+            logger.exception("Failed to get installation for tenant %s", fake_tenant_id)
+
+        Path(output_file).write_text(
+            json.dumps(
+                {
+                    "total_success_tenant": total_success_tenant,
+                    "total_failed_tenant": total_failed_tenant,
+                    "plugin_install_failed": plugin_install_failed,
+                }
+            )
+        )
+
+
    @classmethod
    def handle_plugin_instance_install(
        cls, tenant_id: str, plugin_identifiers_map: Mapping[str, str]
--- a/api/services/plugin/plugin_service.py
+++ b/api/services/plugin/plugin_service.py
@ -10,7 +10,6 @@ from core.helper.download import download_with_size_limit
 from core.helper.marketplace import download_plugin_pkg
 from core.plugin.entities.bundle import PluginBundleDependency
 from core.plugin.entities.plugin import (
-    GenericProviderID,
    PluginDeclaration,
    PluginEntity,
    PluginInstallation,
@ -26,6 +25,7 @@ from core.plugin.impl.asset import PluginAssetManager
 from core.plugin.impl.debugging import PluginDebuggingClient
 from core.plugin.impl.plugin import PluginInstaller
 from extensions.ext_redis import redis_client
+from models.provider_ids import GenericProviderID
 from services.errors.plugin import PluginInstallationForbiddenError
 from services.feature_service import FeatureService, PluginInstallationScope

--- a/api/services/rag_pipeline/entity/pipeline_service_api_entities.py
+++ b/api/services/rag_pipeline/entity/pipeline_service_api_entities.py
@ -0,0 +1,22 @@
+from collections.abc import Mapping
+from typing import Any
+
+from pydantic import BaseModel
+
+
+class DatasourceNodeRunApiEntity(BaseModel):
+    # Request payload for running a single datasource node of a pipeline.
+    pipeline_id: str
+    node_id: str
+    inputs: dict[str, Any]
+    datasource_type: str
+    # Optional; defaults to None when omitted.
+    credential_id: str | None = None
+    is_published: bool
+
+
+class PipelineRunApiEntity(BaseModel):
+    # Request payload for running a pipeline; every field is required.
+    inputs: Mapping[str, Any]
+    datasource_type: str
+    datasource_info_list: list[Mapping[str, Any]]
+    start_node_id: str
+    is_published: bool
+    response_mode: str
--- a/api/services/rag_pipeline/pipeline_generate_service.py
+++ b/api/services/rag_pipeline/pipeline_generate_service.py
@ -0,0 +1,115 @@
+from collections.abc import Mapping
+from typing import Any, Union
+
+from configs import dify_config
+from core.app.apps.pipeline.pipeline_generator import PipelineGenerator
+from core.app.entities.app_invoke_entities import InvokeFrom
+from extensions.ext_database import db
+from models.dataset import Document, Pipeline
+from models.model import Account, App, EndUser
+from models.workflow import Workflow
+from services.rag_pipeline.rag_pipeline import RagPipelineService
+
+
+class PipelineGenerateService:
+    @classmethod
+    def generate(
+        cls,
+        pipeline: Pipeline,
+        user: Union[Account, EndUser],
+        args: Mapping[str, Any],
+        invoke_from: InvokeFrom,
+        streaming: bool = True,
+    ):
+        """
+        Run a pipeline and return its output as an event stream.
+
+        :param pipeline: pipeline to run
+        :param user: account or end user triggering the run
+        :param args: run arguments; a truthy ``original_document_id`` entry causes that
+            document's indexing status to be reset to "waiting" first
+        :param invoke_from: selects the draft (debugger) or the published workflow
+        :param streaming: forwarded to the generator
+        :return: result of ``PipelineGenerator.convert_to_event_stream``
+        :raises ValueError: when the selected workflow does not exist
+        """
+        workflow = cls._get_workflow(pipeline, invoke_from)
+
+        original_document_id = args.get("original_document_id")
+        if original_document_id:
+            # update document status to waiting
+            cls.update_document_status(original_document_id)
+
+        generated = PipelineGenerator().generate(
+            pipeline=pipeline,
+            workflow=workflow,
+            user=user,
+            args=args,
+            invoke_from=invoke_from,
+            streaming=streaming,
+            call_depth=0,
+            workflow_thread_pool_id=None,
+        )
+        return PipelineGenerator.convert_to_event_stream(generated)
+
+    @staticmethod
+    def _get_max_active_requests(app_model: App) -> int:
+        """Return the app's own limit, falling back to ``APP_MAX_ACTIVE_REQUESTS`` when it is None."""
+        configured = app_model.max_active_requests
+        if configured is not None:
+            return configured
+        return int(dify_config.APP_MAX_ACTIVE_REQUESTS)
+
+    @classmethod
+    def generate_single_iteration(
+        cls, pipeline: Pipeline, user: Account, node_id: str, args: Any, streaming: bool = True
+    ):
+        """Run a single iteration node of the draft workflow and return its event stream."""
+        draft_workflow = cls._get_workflow(pipeline, InvokeFrom.DEBUGGER)
+        generated = PipelineGenerator().single_iteration_generate(
+            pipeline=pipeline, workflow=draft_workflow, node_id=node_id, user=user, args=args, streaming=streaming
+        )
+        return PipelineGenerator.convert_to_event_stream(generated)
+
+    @classmethod
+    def generate_single_loop(cls, pipeline: Pipeline, user: Account, node_id: str, args: Any, streaming: bool = True):
+        """Run a single loop node of the draft workflow and return its event stream."""
+        draft_workflow = cls._get_workflow(pipeline, InvokeFrom.DEBUGGER)
+        generated = PipelineGenerator().single_loop_generate(
+            pipeline=pipeline, workflow=draft_workflow, node_id=node_id, user=user, args=args, streaming=streaming
+        )
+        return PipelineGenerator.convert_to_event_stream(generated)
+
+    @classmethod
+    def _get_workflow(cls, pipeline: Pipeline, invoke_from: InvokeFrom) -> Workflow:
+        """
+        Fetch the workflow to run for a pipeline.
+
+        :param pipeline: pipeline
+        :param invoke_from: ``InvokeFrom.DEBUGGER`` selects the draft workflow, anything
+            else selects the published workflow
+        :return: the selected workflow
+        :raises ValueError: when the selected workflow does not exist
+        """
+        service = RagPipelineService()
+        if invoke_from == InvokeFrom.DEBUGGER:
+            workflow = service.get_draft_workflow(pipeline=pipeline)
+            missing_message = "Workflow not initialized"
+        else:
+            workflow = service.get_published_workflow(pipeline=pipeline)
+            missing_message = "Workflow not published"
+
+        if not workflow:
+            raise ValueError(missing_message)
+        return workflow
+
+    @classmethod
+    def update_document_status(cls, document_id: str):
+        """
+        Set a document's indexing status to "waiting"; a missing document is ignored.
+
+        :param document_id: document id
+        """
+        document = db.session.query(Document).where(Document.id == document_id).first()
+        if not document:
+            return
+
+        document.indexing_status = "waiting"
+        db.session.add(document)
+        db.session.commit()
--- a/api/services/rag_pipeline/pipeline_template/init.py
+++ b/api/services/rag_pipeline/pipeline_template/init.py
--- a/api/services/rag_pipeline/pipeline_template/built_in/init.py
+++ b/api/services/rag_pipeline/pipeline_template/built_in/init.py
--- a/api/services/rag_pipeline/pipeline_template/built_in/built_in_retrieval.py
+++ b/api/services/rag_pipeline/pipeline_template/built_in/built_in_retrieval.py
@ -0,0 +1,63 @@
+import json
+from os import path
+from pathlib import Path
+
+from flask import current_app
+
+from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
+from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
+
+
+class BuiltInPipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
+    """
+    Retrieve pipeline templates from the built-in file constants/pipeline_templates.json.
+    """
+
+    # Parsed JSON content, cached on the class after the first successful load.
+    builtin_data: dict | None = None
+
+    def get_type(self) -> str:
+        return PipelineTemplateType.BUILTIN
+
+    def get_pipeline_templates(self, language: str) -> dict:
+        return self.fetch_pipeline_templates_from_builtin(language)
+
+    def get_pipeline_template_detail(self, template_id: str):
+        return self.fetch_pipeline_template_detail_from_builtin(template_id)
+
+    @classmethod
+    def _get_builtin_data(cls) -> dict:
+        """
+        Return the parsed built-in template data, reading the JSON file on first use.
+
+        A falsy cached value (None or an empty mapping) triggers a re-read of the file.
+        :return: parsed JSON content, or an empty dict when the file content is falsy
+        """
+        cached = cls.builtin_data
+        if cached:
+            return cached
+
+        json_file = Path(path.join(current_app.root_path, "constants", "pipeline_templates.json"))
+        cls.builtin_data = json.loads(json_file.read_text(encoding="utf-8"))
+
+        return cls.builtin_data or {}
+
+    @classmethod
+    def fetch_pipeline_templates_from_builtin(cls, language: str) -> dict:
+        """
+        Fetch the built-in pipeline templates for a language.
+
+        :param language: language key under "pipeline_templates"
+        :return: templates for the language, or an empty dict when absent
+        """
+        data: dict[str, dict[str, dict]] = cls._get_builtin_data()
+        templates_by_key = data.get("pipeline_templates", {})
+        return templates_by_key.get(language, {})
+
+    @classmethod
+    def fetch_pipeline_template_detail_from_builtin(cls, template_id: str) -> dict | None:
+        """
+        Fetch one built-in pipeline template detail.
+
+        :param template_id: Template ID
+        :return: the matching entry, or None when absent
+        """
+        # NOTE(review): this looks template_id up in the same "pipeline_templates" mapping that
+        # fetch_pipeline_templates_from_builtin indexes by language; confirm the JSON layout.
+        data: dict[str, dict[str, dict]] = cls._get_builtin_data()
+        templates_by_key = data.get("pipeline_templates", {})
+        return templates_by_key.get(template_id)
--- a/api/services/rag_pipeline/pipeline_template/customized/init.py
+++ b/api/services/rag_pipeline/pipeline_template/customized/init.py
--- a/api/services/rag_pipeline/pipeline_template/customized/customized_retrieval.py
+++ b/api/services/rag_pipeline/pipeline_template/customized/customized_retrieval.py
@ -0,0 +1,81 @@
+import yaml
+from flask_login import current_user
+
+from extensions.ext_database import db
+from models.dataset import PipelineCustomizedTemplate
+from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
+from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
+
+
+class CustomizedPipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
+    """
+    Retrieve customized pipeline templates from the database.
+    """
+
+    def get_pipeline_templates(self, language: str) -> dict:
+        # Templates are scoped to the tenant of the currently logged-in user.
+        result = self.fetch_pipeline_templates_from_customized(
+            tenant_id=current_user.current_tenant_id, language=language
+        )
+        return result
+
+    def get_pipeline_template_detail(self, template_id: str):
+        result = self.fetch_pipeline_template_detail_from_db(template_id)
+        return result
+
+    def get_type(self) -> str:
+        return PipelineTemplateType.CUSTOMIZED
+
+    @classmethod
+    def fetch_pipeline_templates_from_customized(cls, tenant_id: str, language: str) -> dict:
+        """
+        Fetch the customized pipeline templates of a tenant from db.
+
+        :param tenant_id: tenant id
+        :param language: language
+        :return: ``{"pipeline_templates": [...]}``, ordered by position ascending and then
+            by creation time descending
+        """
+        pipeline_customized_templates = (
+            db.session.query(PipelineCustomizedTemplate)
+            .where(PipelineCustomizedTemplate.tenant_id == tenant_id, PipelineCustomizedTemplate.language == language)
+            .order_by(PipelineCustomizedTemplate.position.asc(), PipelineCustomizedTemplate.created_at.desc())
+            .all()
+        )
+        recommended_pipelines_results = [
+            {
+                "id": template.id,
+                "name": template.name,
+                "description": template.description,
+                "icon": template.icon,
+                "position": template.position,
+                "chunk_structure": template.chunk_structure,
+            }
+            for template in pipeline_customized_templates
+        ]
+
+        return {"pipeline_templates": recommended_pipelines_results}
+
+    @classmethod
+    def fetch_pipeline_template_detail_from_db(cls, template_id: str) -> dict | None:
+        """
+        Fetch pipeline template detail from db.
+
+        :param template_id: Template ID
+        :return: template detail, or None when no template has this id
+        """
+        pipeline_template = (
+            db.session.query(PipelineCustomizedTemplate).where(PipelineCustomizedTemplate.id == template_id).first()
+        )
+        if not pipeline_template:
+            return None
+
+        # yaml.safe_load returns None for empty content and may return a non-mapping, and the
+        # "workflow" entry may itself be null; treat each of those as "no graph" instead of
+        # failing on `.get`.
+        dsl_data = yaml.safe_load(pipeline_template.yaml_content)
+        workflow_data = dsl_data.get("workflow") if isinstance(dsl_data, dict) else None
+        graph_data = workflow_data.get("graph", {}) if isinstance(workflow_data, dict) else {}
+
+        return {
+            "id": pipeline_template.id,
+            "name": pipeline_template.name,
+            "icon_info": pipeline_template.icon,
+            "description": pipeline_template.description,
+            "chunk_structure": pipeline_template.chunk_structure,
+            "export_data": pipeline_template.yaml_content,
+            "graph": graph_data,
+            "created_by": pipeline_template.created_user_name,
+        }
--- a/api/services/rag_pipeline/pipeline_template/database/init.py
+++ b/api/services/rag_pipeline/pipeline_template/database/init.py
--- a/api/services/rag_pipeline/pipeline_template/database/database_retrieval.py
+++ b/api/services/rag_pipeline/pipeline_template/database/database_retrieval.py
@ -0,0 +1,78 @@
+import yaml
+
+from extensions.ext_database import db
+from models.dataset import PipelineBuiltInTemplate
+from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
+from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
+
+
+class DatabasePipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
+    """
+    Retrieve built-in pipeline templates from the database.
+    """
+
+    def get_pipeline_templates(self, language: str) -> dict:
+        """
+        List the built-in pipeline templates stored for a language.
+
+        :param language: language the templates are stored under
+        :return: ``{"pipeline_templates": [...]}``
+        """
+        return self.fetch_pipeline_templates_from_db(language)
+
+    def get_pipeline_template_detail(self, template_id: str) -> dict | None:
+        """
+        Load one built-in pipeline template.
+
+        :param template_id: template ID
+        :return: the template detail, or ``None`` when the template does not exist
+        """
+        return self.fetch_pipeline_template_detail_from_db(template_id)
+
+    def get_type(self) -> str:
+        """Return the template type handled by this retrieval (``database``)."""
+        return PipelineTemplateType.DATABASE
+
+    @classmethod
+    def fetch_pipeline_templates_from_db(cls, language: str) -> dict:
+        """
+        Fetch pipeline templates from db.
+
+        :param language: language the templates are stored under
+        :return: ``{"pipeline_templates": [...]}`` with one summary dict per template
+        """
+        pipeline_built_in_templates: list[PipelineBuiltInTemplate] = (
+            db.session.query(PipelineBuiltInTemplate).where(PipelineBuiltInTemplate.language == language).all()
+        )
+
+        recommended_pipelines_results = [
+            {
+                "id": template.id,
+                "name": template.name,
+                "description": template.description,
+                "icon": template.icon,
+                "copyright": template.copyright,
+                "privacy_policy": template.privacy_policy,
+                "position": template.position,
+                "chunk_structure": template.chunk_structure,
+            }
+            for template in pipeline_built_in_templates
+        ]
+
+        return {"pipeline_templates": recommended_pipelines_results}
+
+    @classmethod
+    def fetch_pipeline_template_detail_from_db(cls, template_id: str) -> dict | None:
+        """
+        Fetch pipeline template detail from db.
+
+        :param template_id: ID of the ``PipelineBuiltInTemplate`` row to load
+        :return: template metadata, the raw DSL (``export_data``) and the parsed
+            workflow graph, or ``None`` when no row matches ``template_id``
+        """
+        pipeline_template = (
+            db.session.query(PipelineBuiltInTemplate).where(PipelineBuiltInTemplate.id == template_id).first()
+        )
+        if not pipeline_template:
+            return None
+
+        # An empty DSL parses to None and a malformed one may parse to a scalar or
+        # list; fall back to an empty graph instead of failing on ``.get``.
+        dsl_data = yaml.safe_load(pipeline_template.yaml_content or "")
+        workflow_data = dsl_data.get("workflow") if isinstance(dsl_data, dict) else None
+        graph_data = workflow_data.get("graph", {}) if isinstance(workflow_data, dict) else {}
+
+        return {
+            "id": pipeline_template.id,
+            "name": pipeline_template.name,
+            "icon_info": pipeline_template.icon,
+            "description": pipeline_template.description,
+            "chunk_structure": pipeline_template.chunk_structure,
+            "export_data": pipeline_template.yaml_content,
+            "graph": graph_data,
+            "created_by": pipeline_template.created_user_name,
+        }
--- a/api/services/rag_pipeline/pipeline_template/pipeline_template_base.py
+++ b/api/services/rag_pipeline/pipeline_template/pipeline_template_base.py
@ -0,0 +1,17 @@
+from abc import ABC, abstractmethod
+
+
+class PipelineTemplateRetrievalBase(ABC):
+    """Interface for pipeline template retrieval."""
+
+    @abstractmethod
+    def get_pipeline_templates(self, language: str) -> dict:
+        """
+        List the pipeline templates available for a language.
+
+        :param language: language to list templates for
+        :return: a dict describing the available templates
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_pipeline_template_detail(self, template_id: str) -> dict | None:
+        """
+        Load the detail of a single pipeline template.
+
+        :param template_id: template ID
+        :return: the template detail, or ``None`` when it is not available
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_type(self) -> str:
+        """Return the identifier of the template source served by the implementation."""
+        raise NotImplementedError
--- a/api/services/rag_pipeline/pipeline_template/pipeline_template_factory.py
+++ b/api/services/rag_pipeline/pipeline_template/pipeline_template_factory.py
@ -0,0 +1,26 @@
+from services.rag_pipeline.pipeline_template.built_in.built_in_retrieval import BuiltInPipelineTemplateRetrieval
+from services.rag_pipeline.pipeline_template.customized.customized_retrieval import CustomizedPipelineTemplateRetrieval
+from services.rag_pipeline.pipeline_template.database.database_retrieval import DatabasePipelineTemplateRetrieval
+from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
+from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
+from services.rag_pipeline.pipeline_template.remote.remote_retrieval import RemotePipelineTemplateRetrieval
+
+
+class PipelineTemplateRetrievalFactory:
+    """Maps a pipeline template mode to the retrieval class that serves it."""
+
+    @staticmethod
+    def get_pipeline_template_factory(mode: str) -> type[PipelineTemplateRetrievalBase]:
+        """
+        Resolve the retrieval class for a template mode.
+
+        :param mode: one of the ``PipelineTemplateType`` values
+        :return: the retrieval class (not an instance) for that mode
+        :raises ValueError: if ``mode`` is not a known template type
+        """
+        match mode:
+            case PipelineTemplateType.REMOTE:
+                return RemotePipelineTemplateRetrieval
+            case PipelineTemplateType.CUSTOMIZED:
+                return CustomizedPipelineTemplateRetrieval
+            case PipelineTemplateType.DATABASE:
+                return DatabasePipelineTemplateRetrieval
+            case PipelineTemplateType.BUILTIN:
+                return BuiltInPipelineTemplateRetrieval
+            case _:
+                # The message names pipeline templates, which is what this factory serves.
+                raise ValueError(f"invalid fetch pipeline templates mode: {mode}")
+
+    @staticmethod
+    def get_built_in_pipeline_template_retrieval() -> type[BuiltInPipelineTemplateRetrieval]:
+        """Return the built-in retrieval class (not an instance)."""
+        return BuiltInPipelineTemplateRetrieval
--- a/api/services/rag_pipeline/pipeline_template/pipeline_template_type.py
+++ b/api/services/rag_pipeline/pipeline_template/pipeline_template_type.py
@ -0,0 +1,8 @@
+from enum import StrEnum
+
+
+class PipelineTemplateType(StrEnum):
+    """Sources a pipeline template can be retrieved from."""
+
+    REMOTE = "remote"
+    DATABASE = "database"
+    CUSTOMIZED = "customized"
+    BUILTIN = "builtin"
--- a/api/services/rag_pipeline/pipeline_template/remote/__init__.py
+++ b/api/services/rag_pipeline/pipeline_template/remote/__init__.py
--- a/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py
+++ b/api/services/rag_pipeline/pipeline_template/remote/remote_retrieval.py
@ -0,0 +1,67 @@
+import logging
+
+import requests
+
+from configs import dify_config
+from services.rag_pipeline.pipeline_template.database.database_retrieval import DatabasePipelineTemplateRetrieval
+from services.rag_pipeline.pipeline_template.pipeline_template_base import PipelineTemplateRetrievalBase
+from services.rag_pipeline.pipeline_template.pipeline_template_type import PipelineTemplateType
+
+logger = logging.getLogger(__name__)
+
+
+class RemotePipelineTemplateRetrieval(PipelineTemplateRetrievalBase):
+    """
+    Retrieve pipeline templates from the Dify official service, falling back to
+    the database when the remote call raises.
+    """
+
+    def get_pipeline_template_detail(self, template_id: str) -> dict | None:
+        """
+        Load one pipeline template from the remote service.
+
+        Falls back to the database only when the remote call raises; a non-200
+        response yields ``None`` without consulting the database.
+
+        :param template_id: template ID
+        :return: the template detail, or ``None`` when it is not available
+        """
+        try:
+            result = self.fetch_pipeline_template_detail_from_dify_official(template_id)
+        except Exception as e:
+            logger.warning("fetch pipeline template detail from dify official failed: %r, switch to database.", e)
+            result = DatabasePipelineTemplateRetrieval.fetch_pipeline_template_detail_from_db(template_id)
+        return result
+
+    def get_pipeline_templates(self, language: str) -> dict:
+        """
+        List pipeline templates from the remote service.
+
+        Falls back to the database when the remote call raises, which includes
+        any non-200 response.
+
+        :param language: language to list templates for
+        :return: the template listing
+        """
+        try:
+            result = self.fetch_pipeline_templates_from_dify_official(language)
+        except Exception as e:
+            logger.warning("fetch pipeline templates from dify official failed: %r, switch to database.", e)
+            result = DatabasePipelineTemplateRetrieval.fetch_pipeline_templates_from_db(language)
+        return result
+
+    def get_type(self) -> str:
+        """Return the template type handled by this retrieval (``remote``)."""
+        return PipelineTemplateType.REMOTE
+
+    @classmethod
+    def fetch_pipeline_template_detail_from_dify_official(cls, template_id: str) -> dict | None:
+        """
+        Fetch pipeline template detail from dify official.
+
+        :param template_id: template ID
+        :return: the decoded JSON body, or ``None`` when the response status is not 200
+        """
+        domain = dify_config.HOSTED_FETCH_PIPELINE_TEMPLATES_REMOTE_DOMAIN
+        url = f"{domain}/pipeline-templates/{template_id}"
+        response = requests.get(url, timeout=(3, 10))
+        if response.status_code != 200:
+            return None
+        data: dict = response.json()
+        return data
+
+    @classmethod
+    def fetch_pipeline_templates_from_dify_official(cls, language: str) -> dict:
+        """
+        Fetch pipeline templates from dify official.
+
+        :param language: language to list templates for
+        :return: the decoded JSON body
+        :raises ValueError: when the response status is not 200
+        """
+        domain = dify_config.HOSTED_FETCH_PIPELINE_TEMPLATES_REMOTE_DOMAIN
+        url = f"{domain}/pipeline-templates"
+        # Let requests build the query string so the language value is URL-encoded.
+        response = requests.get(url, params={"language": language}, timeout=(3, 10))
+        if response.status_code != 200:
+            raise ValueError(f"fetch pipeline templates failed, status code: {response.status_code}")
+
+        result: dict = response.json()
+
+        return result
--- a/api/services/rag_pipeline/rag_pipeline.py
+++ b/api/services/rag_pipeline/rag_pipeline.py
--- a/api/services/rag_pipeline/rag_pipeline_dsl_service.py
+++ b/api/services/rag_pipeline/rag_pipeline_dsl_service.py
@ -0,0 +1,932 @@
+import base64
+import hashlib
+import json
+import logging
+import uuid
+from collections.abc import Mapping
+from datetime import UTC, datetime
+from enum import StrEnum
+from typing import cast
+from urllib.parse import urlparse
+from uuid import uuid4
+
+import yaml  # type: ignore
+from Crypto.Cipher import AES
+from Crypto.Util.Padding import pad, unpad
+from flask_login import current_user
+from packaging import version
+from pydantic import BaseModel, Field
+from sqlalchemy import select
+from sqlalchemy.orm import Session
+
+from core.helper import ssrf_proxy
+from core.helper.name_generator import generate_incremental_name
+from core.model_runtime.utils.encoders import jsonable_encoder
+from core.plugin.entities.plugin import PluginDependency
+from core.workflow.enums import NodeType
+from core.workflow.nodes.datasource.entities import DatasourceNodeData
+from core.workflow.nodes.knowledge_retrieval.entities import KnowledgeRetrievalNodeData
+from core.workflow.nodes.llm.entities import LLMNodeData
+from core.workflow.nodes.parameter_extractor.entities import ParameterExtractorNodeData
+from core.workflow.nodes.question_classifier.entities import QuestionClassifierNodeData
+from core.workflow.nodes.tool.entities import ToolNodeData
+from extensions.ext_redis import redis_client
+from factories import variable_factory
+from models import Account
+from models.dataset import Dataset, DatasetCollectionBinding, Pipeline
+from models.workflow import Workflow, WorkflowType
+from services.entities.knowledge_entities.rag_pipeline_entities import (
+    IconInfo,
+    KnowledgeConfiguration,
+    RagPipelineDatasetCreateEntity,
+)
+from services.plugin.dependencies_analysis import DependenciesAnalysisService
+
+logger = logging.getLogger(__name__)
+
+# Redis key prefix under which a pending import (awaiting confirmation) is stored, keyed by import id.
+IMPORT_INFO_REDIS_KEY_PREFIX = "app_import_info:"
+# Redis key prefix from which check_dependencies reads the dependency list, keyed by pipeline id.
+CHECK_DEPENDENCIES_REDIS_KEY_PREFIX = "app_check_dependencies:"
+# Lifetime of a pending import entry in Redis.
+IMPORT_INFO_REDIS_EXPIRY = 10 * 60  # 10 minutes
+# Upper bound applied to DSL content fetched from a URL.
+DSL_MAX_SIZE = 10 * 1024 * 1024  # 10MB
+# DSL version this service compares imported documents against.
+CURRENT_DSL_VERSION = "0.1.0"
+
+
+class ImportMode(StrEnum):
+    """Where the DSL of an import request comes from."""
+
+    # The YAML document is passed inline by the caller.
+    YAML_CONTENT = "yaml-content"
+    # The YAML document is downloaded from a URL.
+    YAML_URL = "yaml-url"
+
+
+class ImportStatus(StrEnum):
+    """Outcome of an import attempt."""
+
+    # The import was applied.
+    COMPLETED = "completed"
+    # The import was applied although the DSL has an older minor version.
+    COMPLETED_WITH_WARNINGS = "completed-with-warnings"
+    # The import was stored and must be confirmed before it is applied.
+    PENDING = "pending"
+    # The import was rejected or raised an error.
+    FAILED = "failed"
+
+
+# Result returned to the caller for every import or confirm attempt.
+class RagPipelineImportInfo(BaseModel):
+    # Identifier of this import attempt; also the suffix of the Redis key for pending imports.
+    id: str
+    status: ImportStatus
+    # Pipeline created or updated by the import, when there is one.
+    pipeline_id: str | None = None
+    current_dsl_version: str = CURRENT_DSL_VERSION
+    # Version declared by the imported DSL; left empty on early failures.
+    imported_dsl_version: str = ""
+    # Human-readable failure reason; empty when the import did not fail.
+    error: str = ""
+    # Dataset linked to the pipeline by the import, when there is one.
+    dataset_id: str | None = None
+
+
+# Result of check_dependencies: the dependencies reported as leaked for the pipeline's tenant.
+class CheckDependenciesResult(BaseModel):
+    leaked_dependencies: list[PluginDependency] = Field(default_factory=list)
+
+
+def _check_version_compatibility(imported_version: str) -> ImportStatus:
+    """
+    Map the version of an imported DSL to an import status.
+
+    :param imported_version: version string declared by the imported DSL
+    :return: ``FAILED`` when a version cannot be parsed, ``PENDING`` when the
+        imported version is newer than the current one or has an older major,
+        ``COMPLETED_WITH_WARNINGS`` when only the minor is older, otherwise
+        ``COMPLETED``
+    """
+    try:
+        current = version.parse(CURRENT_DSL_VERSION)
+        imported = version.parse(imported_version)
+    except version.InvalidVersion:
+        return ImportStatus.FAILED
+
+    # A newer DSL, or one from an older major, must be confirmed by the user first.
+    if imported > current or imported.major < current.major:
+        return ImportStatus.PENDING
+
+    # Same major from here on: an older minor imports with a warning,
+    # anything else (same minor, same or older micro) imports cleanly.
+    if imported.minor < current.minor:
+        return ImportStatus.COMPLETED_WITH_WARNINGS
+    return ImportStatus.COMPLETED
+
+
+# Payload stored in Redis for an import that is waiting for confirmation.
+class RagPipelinePendingData(BaseModel):
+    import_mode: str
+    # Full YAML document, already fetched when the import came from a URL.
+    yaml_content: str
+    # Pipeline the import should update, or None to create a new one.
+    pipeline_id: str | None
+
+
+# Payload read back from Redis by check_dependencies.
+class CheckDependenciesPendingData(BaseModel):
+    dependencies: list[PluginDependency]
+    pipeline_id: str | None
+
+
+class RagPipelineDslService:
+    def __init__(self, session: Session):
+        """
+        :param session: SQLAlchemy session the service's methods use for queries and commits
+        """
+        self._session = session
+
+    def import_rag_pipeline(
+        self,
+        *,
+        account: Account,
+        import_mode: str,
+        yaml_content: str | None = None,
+        yaml_url: str | None = None,
+        pipeline_id: str | None = None,
+        dataset: Dataset | None = None,
+        dataset_name: str | None = None,
+        icon_info: IconInfo | None = None,
+    ) -> RagPipelineImportInfo:
+        """
+        Import a RAG pipeline from YAML content or from a URL.
+
+        The DSL is loaded, its version is checked, the pipeline is created or
+        updated, and a dataset is created or linked for every ``knowledge-index``
+        node of the workflow graph. When the version check yields ``PENDING`` the
+        DSL is parked in Redis instead and must be applied with ``confirm_import``.
+
+        :param account: account performing the import; its current tenant scopes all lookups
+        :param import_mode: one of the ``ImportMode`` values
+        :param yaml_content: DSL text, required for ``yaml-content`` mode
+        :param yaml_url: DSL location, required for ``yaml-url`` mode
+        :param pipeline_id: existing pipeline to update; when given, ``dataset`` is
+            replaced by the dataset retrieved from that pipeline
+        :param dataset: dataset to link to the pipeline instead of creating a new one
+        :param dataset_name: not referenced anywhere in this method
+        :param icon_info: icon for a newly created dataset; defaults to the icon fields of the DSL
+        :return: import info; every failure after mode validation is reported through
+            ``status=FAILED`` and ``error`` instead of an exception
+        :raises ValueError: if ``import_mode`` is not a valid ``ImportMode``
+        """
+        import_id = str(uuid.uuid4())
+
+        # Validate import mode
+        try:
+            mode = ImportMode(import_mode)
+        except ValueError:
+            raise ValueError(f"Invalid import_mode: {import_mode}")
+
+        # Get YAML content
+        content: str = ""
+        if mode == ImportMode.YAML_URL:
+            if not yaml_url:
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=ImportStatus.FAILED,
+                    error="yaml_url is required when import_mode is yaml-url",
+                )
+            try:
+                parsed_url = urlparse(yaml_url)
+                # Rewrite a github.com file page URL to the matching raw.githubusercontent.com URL.
+                if (
+                    parsed_url.scheme == "https"
+                    and parsed_url.netloc == "github.com"
+                    and parsed_url.path.endswith((".yml", ".yaml"))
+                ):
+                    yaml_url = yaml_url.replace("https://github.com", "https://raw.githubusercontent.com")
+                    yaml_url = yaml_url.replace("/blob/", "/")
+                response = ssrf_proxy.get(yaml_url.strip(), follow_redirects=True, timeout=(10, 10))
+                response.raise_for_status()
+                content = response.content.decode()
+
+                # NOTE: the limit is applied to the decoded character count, not to the byte size.
+                if len(content) > DSL_MAX_SIZE:
+                    return RagPipelineImportInfo(
+                        id=import_id,
+                        status=ImportStatus.FAILED,
+                        error="File size exceeds the limit of 10MB",
+                    )
+
+                if not content:
+                    return RagPipelineImportInfo(
+                        id=import_id,
+                        status=ImportStatus.FAILED,
+                        error="Empty content from url",
+                    )
+            except Exception as e:
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=ImportStatus.FAILED,
+                    error=f"Error fetching YAML from URL: {str(e)}",
+                )
+        elif mode == ImportMode.YAML_CONTENT:
+            if not yaml_content:
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=ImportStatus.FAILED,
+                    error="yaml_content is required when import_mode is yaml-content",
+                )
+            content = yaml_content
+
+        # Process YAML content
+        try:
+            # Parse YAML to validate format
+            data = yaml.safe_load(content)
+            if not isinstance(data, dict):
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=ImportStatus.FAILED,
+                    error="Invalid YAML format: content must be a mapping",
+                )
+
+            # Validate and fix DSL version; a missing version or a different kind is overwritten
+            if not data.get("version"):
+                data["version"] = "0.1.0"
+            if not data.get("kind") or data.get("kind") != "rag_pipeline":
+                data["kind"] = "rag_pipeline"
+
+            imported_version = data.get("version", "0.1.0")
+            # An unquoted YAML version such as 0.1 parses as a float; only strings are accepted
+            if not isinstance(imported_version, str):
+                raise ValueError(f"Invalid version type, expected str, got {type(imported_version)}")
+            status = _check_version_compatibility(imported_version)
+
+            # Extract pipeline data
+            pipeline_data = data.get("rag_pipeline")
+            if not pipeline_data:
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=ImportStatus.FAILED,
+                    error="Missing rag_pipeline data in YAML content",
+                )
+
+            # If pipeline_id is provided, the pipeline must exist in the account's current tenant
+            pipeline = None
+            if pipeline_id:
+                stmt = select(Pipeline).where(
+                    Pipeline.id == pipeline_id,
+                    Pipeline.tenant_id == account.current_tenant_id,
+                )
+                pipeline = self._session.scalar(stmt)
+
+                if not pipeline:
+                    return RagPipelineImportInfo(
+                        id=import_id,
+                        status=ImportStatus.FAILED,
+                        error="Pipeline not found",
+                    )
+                # Overrides any dataset passed by the caller
+                dataset = pipeline.retrieve_dataset(session=self._session)
+
+            # If the version check requires confirmation, store import info in Redis and stop here
+            if status == ImportStatus.PENDING:
+                pending_data = RagPipelinePendingData(
+                    import_mode=import_mode,
+                    yaml_content=content,
+                    pipeline_id=pipeline_id,
+                )
+                redis_client.setex(
+                    f"{IMPORT_INFO_REDIS_KEY_PREFIX}{import_id}",
+                    IMPORT_INFO_REDIS_EXPIRY,
+                    pending_data.model_dump_json(),
+                )
+
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=status,
+                    pipeline_id=pipeline_id,
+                    imported_dsl_version=imported_version,
+                )
+
+            # Extract dependencies
+            dependencies = data.get("dependencies", [])
+            check_dependencies_pending_data = None
+            if dependencies:
+                check_dependencies_pending_data = [PluginDependency.model_validate(d) for d in dependencies]
+
+            # Create or update pipeline
+            pipeline = self._create_or_update_pipeline(
+                pipeline=pipeline,
+                data=data,
+                account=account,
+                dependencies=check_dependencies_pending_data,
+            )
+            # create dataset
+            name = pipeline.name or "Untitled"
+            description = pipeline.description
+            # An explicit icon_info takes precedence over the icon fields of the DSL
+            if icon_info:
+                icon_type = icon_info.icon_type
+                icon = icon_info.icon
+                icon_background = icon_info.icon_background
+                icon_url = icon_info.icon_url
+            else:
+                icon_type = data.get("rag_pipeline", {}).get("icon_type")
+                icon = data.get("rag_pipeline", {}).get("icon")
+                icon_background = data.get("rag_pipeline", {}).get("icon_background")
+                icon_url = data.get("rag_pipeline", {}).get("icon_url")
+            workflow = data.get("workflow", {})
+            graph = workflow.get("graph", {})
+            nodes = graph.get("nodes", [])
+            # Stays None when the graph has no knowledge-index node, which fails the import below
+            dataset_id = None
+            for node in nodes:
+                if node.get("data", {}).get("type") == "knowledge-index":
+                    knowledge_configuration = KnowledgeConfiguration(**node.get("data", {}))
+                    # A published pipeline must keep the chunk structure of its dataset
+                    if (
+                        dataset
+                        and pipeline.is_published
+                        and dataset.chunk_structure != knowledge_configuration.chunk_structure
+                    ):
+                        raise ValueError("Chunk structure is not compatible with the published pipeline")
+                    if not dataset:
+                        # Derive a dataset name from the names already used in the tenant
+                        datasets = self._session.query(Dataset).filter_by(tenant_id=account.current_tenant_id).all()
+                        # The comprehension variable is local to the comprehension; the outer `dataset` is untouched
+                        names = [dataset.name for dataset in datasets]
+                        generate_name = generate_incremental_name(names, name)
+                        dataset = Dataset(
+                            tenant_id=account.current_tenant_id,
+                            name=generate_name,
+                            description=description,
+                            icon_info={
+                                "icon_type": icon_type,
+                                "icon": icon,
+                                "icon_background": icon_background,
+                                "icon_url": icon_url,
+                            },
+                            indexing_technique=knowledge_configuration.indexing_technique,
+                            created_by=account.id,
+                            retrieval_model=knowledge_configuration.retrieval_model.model_dump(),
+                            runtime_mode="rag_pipeline",
+                            chunk_structure=knowledge_configuration.chunk_structure,
+                        )
+                    if knowledge_configuration.indexing_technique == "high_quality":
+                        # Reuse the oldest collection binding for this embedding model, or create one
+                        dataset_collection_binding = (
+                            self._session.query(DatasetCollectionBinding)
+                            .where(
+                                DatasetCollectionBinding.provider_name
+                                == knowledge_configuration.embedding_model_provider,
+                                DatasetCollectionBinding.model_name == knowledge_configuration.embedding_model,
+                                DatasetCollectionBinding.type == "dataset",
+                            )
+                            .order_by(DatasetCollectionBinding.created_at)
+                            .first()
+                        )
+
+                        if not dataset_collection_binding:
+                            dataset_collection_binding = DatasetCollectionBinding(
+                                provider_name=knowledge_configuration.embedding_model_provider,
+                                model_name=knowledge_configuration.embedding_model,
+                                collection_name=Dataset.gen_collection_name_by_id(str(uuid.uuid4())),
+                                type="dataset",
+                            )
+                            self._session.add(dataset_collection_binding)
+                            self._session.commit()
+                        dataset_collection_binding_id = dataset_collection_binding.id
+                        dataset.collection_binding_id = dataset_collection_binding_id
+                        dataset.embedding_model = knowledge_configuration.embedding_model
+                        dataset.embedding_model_provider = knowledge_configuration.embedding_model_provider
+                    elif knowledge_configuration.indexing_technique == "economy":
+                        dataset.keyword_number = knowledge_configuration.keyword_number
+                    # Link the dataset to the pipeline and persist it
+                    dataset.pipeline_id = pipeline.id
+                    self._session.add(dataset)
+                    self._session.commit()
+                    dataset_id = dataset.id
+            if not dataset_id:
+                raise ValueError("DSL is not valid, please check the Knowledge Index node.")
+
+            return RagPipelineImportInfo(
+                id=import_id,
+                status=status,
+                pipeline_id=pipeline.id,
+                dataset_id=dataset_id,
+                imported_dsl_version=imported_version,
+            )
+
+        except yaml.YAMLError as e:
+            return RagPipelineImportInfo(
+                id=import_id,
+                status=ImportStatus.FAILED,
+                error=f"Invalid YAML format: {str(e)}",
+            )
+
+        # Any other error is logged and reported to the caller as a failed import
+        except Exception as e:
+            logger.exception("Failed to import app")
+            return RagPipelineImportInfo(
+                id=import_id,
+                status=ImportStatus.FAILED,
+                error=str(e),
+            )
+
+    def confirm_import(self, *, import_id: str, account: Account) -> RagPipelineImportInfo:
+        """
+        Confirm an import that requires confirmation
+
+        Loads the pending DSL stored in Redis under ``import_id``, creates or
+        updates the pipeline, creates or updates its dataset for every
+        ``knowledge-index`` node, and removes the Redis entry on success.
+
+        :param import_id: ID returned by ``import_rag_pipeline`` for the pending import
+        :param account: account confirming the import; its current tenant scopes the pipeline lookup
+        :return: import info with ``status=COMPLETED`` on success; a missing or
+            expired entry and any error are reported through ``status=FAILED``
+        """
+        redis_key = f"{IMPORT_INFO_REDIS_KEY_PREFIX}{import_id}"
+        pending_data = redis_client.get(redis_key)
+
+        if not pending_data:
+            return RagPipelineImportInfo(
+                id=import_id,
+                status=ImportStatus.FAILED,
+                error="Import information expired or does not exist",
+            )
+
+        try:
+            if not isinstance(pending_data, str | bytes):
+                return RagPipelineImportInfo(
+                    id=import_id,
+                    status=ImportStatus.FAILED,
+                    error="Invalid import information",
+                )
+            pending_data = RagPipelinePendingData.model_validate_json(pending_data)
+            data = yaml.safe_load(pending_data.yaml_content)
+
+            # NOTE(review): if the stored pipeline_id no longer resolves, `pipeline` stays None
+            # and is handed to _create_or_update_pipeline as such - confirm that is intended
+            pipeline = None
+            if pending_data.pipeline_id:
+                stmt = select(Pipeline).where(
+                    Pipeline.id == pending_data.pipeline_id,
+                    Pipeline.tenant_id == account.current_tenant_id,
+                )
+                pipeline = self._session.scalar(stmt)
+
+            # Create or update pipeline (no dependencies are passed on this path)
+            pipeline = self._create_or_update_pipeline(
+                pipeline=pipeline,
+                data=data,
+                account=account,
+            )
+            dataset = pipeline.retrieve_dataset(session=self._session)
+
+            # create dataset
+            # The pipeline name is used as-is for a new dataset
+            name = pipeline.name
+            description = pipeline.description
+            icon_type = data.get("rag_pipeline", {}).get("icon_type")
+            icon = data.get("rag_pipeline", {}).get("icon")
+            icon_background = data.get("rag_pipeline", {}).get("icon_background")
+            icon_url = data.get("rag_pipeline", {}).get("icon_url")
+            workflow = data.get("workflow", {})
+            graph = workflow.get("graph", {})
+            nodes = graph.get("nodes", [])
+            # Stays None when the graph has no knowledge-index node, which fails the import below
+            dataset_id = None
+            for node in nodes:
+                if node.get("data", {}).get("type") == "knowledge-index":
+                    knowledge_configuration = KnowledgeConfiguration(**node.get("data", {}))
+                    if not dataset:
+                        dataset = Dataset(
+                            tenant_id=account.current_tenant_id,
+                            name=name,
+                            description=description,
+                            icon_info={
+                                "icon_type": icon_type,
+                                "icon": icon,
+                                "icon_background": icon_background,
+                                "icon_url": icon_url,
+                            },
+                            indexing_technique=knowledge_configuration.indexing_technique,
+                            created_by=account.id,
+                            retrieval_model=knowledge_configuration.retrieval_model.model_dump(),
+                            runtime_mode="rag_pipeline",
+                            chunk_structure=knowledge_configuration.chunk_structure,
+                        )
+                    else:
+                        # An existing dataset is overwritten with the settings of the imported node
+                        dataset.indexing_technique = knowledge_configuration.indexing_technique
+                        dataset.retrieval_model = knowledge_configuration.retrieval_model.model_dump()
+                        dataset.runtime_mode = "rag_pipeline"
+                        dataset.chunk_structure = knowledge_configuration.chunk_structure
+                    if knowledge_configuration.indexing_technique == "high_quality":
+                        # Reuse the oldest collection binding for this embedding model, or create one
+                        dataset_collection_binding = (
+                            self._session.query(DatasetCollectionBinding)
+                            .where(
+                                DatasetCollectionBinding.provider_name
+                                == knowledge_configuration.embedding_model_provider,
+                                DatasetCollectionBinding.model_name == knowledge_configuration.embedding_model,
+                                DatasetCollectionBinding.type == "dataset",
+                            )
+                            .order_by(DatasetCollectionBinding.created_at)
+                            .first()
+                        )
+
+                        if not dataset_collection_binding:
+                            dataset_collection_binding = DatasetCollectionBinding(
+                                provider_name=knowledge_configuration.embedding_model_provider,
+                                model_name=knowledge_configuration.embedding_model,
+                                collection_name=Dataset.gen_collection_name_by_id(str(uuid.uuid4())),
+                                type="dataset",
+                            )
+                            self._session.add(dataset_collection_binding)
+                            self._session.commit()
+                        dataset_collection_binding_id = dataset_collection_binding.id
+                        dataset.collection_binding_id = dataset_collection_binding_id
+                        dataset.embedding_model = knowledge_configuration.embedding_model
+                        dataset.embedding_model_provider = knowledge_configuration.embedding_model_provider
+                    elif knowledge_configuration.indexing_technique == "economy":
+                        dataset.keyword_number = knowledge_configuration.keyword_number
+                    # Link the dataset to the pipeline and persist it
+                    dataset.pipeline_id = pipeline.id
+                    self._session.add(dataset)
+                    self._session.commit()
+                    dataset_id = dataset.id
+            if not dataset_id:
+                raise ValueError("DSL is not valid, please check the Knowledge Index node.")
+
+            # Delete import info from Redis
+            redis_client.delete(redis_key)
+
+            return RagPipelineImportInfo(
+                id=import_id,
+                status=ImportStatus.COMPLETED,
+                pipeline_id=pipeline.id,
+                dataset_id=dataset_id,
+                current_dsl_version=CURRENT_DSL_VERSION,
+                imported_dsl_version=data.get("version", "0.1.0"),
+            )
+
+        # On failure the Redis entry is left in place until it expires
+        except Exception as e:
+            logger.exception("Error confirming import")
+            return RagPipelineImportInfo(
+                id=import_id,
+                status=ImportStatus.FAILED,
+                error=str(e),
+            )
+
+    def check_dependencies(
+        self,
+        *,
+        pipeline: Pipeline,
+    ) -> CheckDependenciesResult:
+        """
+        Report the leaked dependencies of a pipeline.
+
+        :param pipeline: pipeline whose dependency list is looked up in Redis
+        :return: the dependencies that ``DependenciesAnalysisService`` reports as
+            leaked for the pipeline's tenant; an empty result when Redis holds no
+            dependency list for the pipeline
+        """
+        cached_payload = redis_client.get(f"{CHECK_DEPENDENCIES_REDIS_KEY_PREFIX}{pipeline.id}")
+        if not cached_payload:
+            # Nothing stored for this pipeline: nothing to check.
+            return CheckDependenciesResult()
+
+        pending = CheckDependenciesPendingData.model_validate_json(cached_payload)
+        leaked = DependenciesAnalysisService.get_leaked_dependencies(
+            tenant_id=pipeline.tenant_id, dependencies=pending.dependencies
+        )
+        return CheckDependenciesResult(leaked_dependencies=leaked)
+
+    def _create_or_update_pipeline(
+        self,
+        *,
+        pipeline: Pipeline | None,
+        data: dict,
+        account: Account,
+        dependencies: list[PluginDependency] | None = None,
+    ) -> Pipeline:
+        """
+        Create a new pipeline or update an existing one, then sync its draft workflow.
+
+        :param pipeline: existing Pipeline to update, or None to create a new one
+        :param data: parsed DSL; reads the "rag_pipeline" and "workflow" sections
+        :param account: account performing the import; its current tenant owns a new pipeline
+        :param dependencies: plugin dependencies to park in Redis for a later dependency check
+        :return: the created or updated Pipeline
+        :raises ValueError: if the account has no current tenant or the DSL has no workflow mapping
+        """
+        if not account.current_tenant_id:
+            raise ValueError("Tenant id is required")
+        pipeline_data = data.get("rag_pipeline", {})
+        workflow_data = data.get("workflow")
+        if not workflow_data or not isinstance(workflow_data, dict):
+            raise ValueError("Missing workflow data for rag pipeline")
+
+        environment_variables_list = workflow_data.get("environment_variables", [])
+        environment_variables = [
+            variable_factory.build_environment_variable_from_mapping(obj) for obj in environment_variables_list
+        ]
+        conversation_variables_list = workflow_data.get("conversation_variables", [])
+        conversation_variables = [
+            variable_factory.build_conversation_variable_from_mapping(obj) for obj in conversation_variables_list
+        ]
+        rag_pipeline_variables_list = workflow_data.get("rag_pipeline_variables", [])
+
+        graph = workflow_data.get("graph", {})
+        for node in graph.get("nodes", []):
+            if node.get("data", {}).get("type", "") == NodeType.KNOWLEDGE_RETRIEVAL.value:
+                # Dataset ids are stored encrypted in the DSL; ids that cannot be decrypted
+                # with this tenant's key come back falsy and are dropped.
+                dataset_ids = node["data"].get("dataset_ids", [])
+                node["data"]["dataset_ids"] = [
+                    decrypted_id
+                    for dataset_id in dataset_ids
+                    if (
+                        decrypted_id := self.decrypt_dataset_id(
+                            encrypted_data=dataset_id,
+                            tenant_id=account.current_tenant_id,
+                        )
+                    )
+                ]
+
+        if pipeline:
+            # Update existing pipeline; fields absent from the DSL keep their current value
+            pipeline.name = pipeline_data.get("name", pipeline.name)
+            pipeline.description = pipeline_data.get("description", pipeline.description)
+            pipeline.updated_by = account.id
+
+        else:
+            # Create new pipeline (the tenant id was already validated at the top of the method)
+            pipeline = Pipeline()
+            pipeline.id = str(uuid4())
+            pipeline.tenant_id = account.current_tenant_id
+            pipeline.name = pipeline_data.get("name", "")
+            pipeline.description = pipeline_data.get("description", "")
+            pipeline.created_by = account.id
+            pipeline.updated_by = account.id
+
+            self._session.add(pipeline)
+            self._session.commit()
+        # save dependencies
+        if dependencies:
+            redis_client.setex(
+                f"{CHECK_DEPENDENCIES_REDIS_KEY_PREFIX}{pipeline.id}",
+                IMPORT_INFO_REDIS_EXPIRY,
+                CheckDependenciesPendingData(pipeline_id=pipeline.id, dependencies=dependencies).model_dump_json(),
+            )
+        workflow = (
+            self._session.query(Workflow)
+            .where(
+                Workflow.tenant_id == pipeline.tenant_id,
+                Workflow.app_id == pipeline.id,
+                Workflow.version == "draft",
+            )
+            .first()
+        )
+
+        # create draft workflow if not found
+        if not workflow:
+            workflow = Workflow(
+                tenant_id=pipeline.tenant_id,
+                app_id=pipeline.id,
+                features="{}",
+                type=WorkflowType.RAG_PIPELINE.value,
+                version="draft",
+                graph=json.dumps(graph),
+                created_by=account.id,
+                environment_variables=environment_variables,
+                conversation_variables=conversation_variables,
+                rag_pipeline_variables=rag_pipeline_variables_list,
+            )
+            self._session.add(workflow)
+            # flush so the workflow id is available before linking it to the pipeline
+            self._session.flush()
+            pipeline.workflow_id = workflow.id
+        else:
+            workflow.graph = json.dumps(graph)
+            workflow.updated_by = account.id
+            workflow.updated_at = datetime.now(UTC).replace(tzinfo=None)
+            workflow.environment_variables = environment_variables
+            workflow.conversation_variables = conversation_variables
+            workflow.rag_pipeline_variables = rag_pipeline_variables_list
+        # commit db session changes
+        self._session.commit()
+
+        return pipeline
+
+    def export_rag_pipeline_dsl(self, pipeline: Pipeline, include_secret: bool = False) -> str:
+        """
+        Serialize a pipeline and its draft workflow into a YAML DSL document.
+
+        :param pipeline: Pipeline instance
+        :param include_secret: passed through to the workflow export
+        :return: the DSL as YAML text
+        :raises ValueError: if no dataset can be retrieved for the pipeline
+        """
+        dataset = pipeline.retrieve_dataset(session=self._session)
+        if not dataset:
+            raise ValueError("Missing dataset for rag pipeline")
+
+        # A missing or empty icon_info falls back to the default icon settings.
+        icon_info = dataset.icon_info or {}
+        rag_pipeline_section = {
+            "name": dataset.name,
+            "icon": icon_info.get("icon", "📙"),
+            "icon_type": icon_info.get("icon_type", "emoji"),
+            "icon_background": icon_info.get("icon_background", "#FFEAD5"),
+            "icon_url": icon_info.get("icon_url"),
+            "description": pipeline.description,
+        }
+        export_data = {
+            "version": CURRENT_DSL_VERSION,
+            "kind": "rag_pipeline",
+            "rag_pipeline": rag_pipeline_section,
+        }
+
+        # Adds the "workflow" and "dependencies" sections in place.
+        self._append_workflow_export_data(export_data=export_data, pipeline=pipeline, include_secret=include_secret)
+
+        return yaml.dump(export_data, allow_unicode=True)  # type: ignore
+
+    def _append_workflow_export_data(self, *, export_data: dict, pipeline: Pipeline, include_secret: bool) -> None:
+        """
+        Add the "workflow" and "dependencies" sections to the export data in place.
+
+        :param export_data: export data, mutated by this method
+        :param pipeline: Pipeline instance whose draft workflow is exported
+        :param include_secret: passed through to Workflow.to_dict
+        :raises ValueError: if the pipeline has no draft workflow
+        """
+        draft_filters = (
+            Workflow.tenant_id == pipeline.tenant_id,
+            Workflow.app_id == pipeline.id,
+            Workflow.version == "draft",
+        )
+        workflow = self._session.query(Workflow).where(*draft_filters).first()
+        if not workflow:
+            raise ValueError("Missing draft workflow configuration, please check.")
+
+        workflow_dict = workflow.to_dict(include_secret=include_secret)
+        for node in workflow_dict.get("graph", {}).get("nodes", []):
+            if node.get("data", {}).get("type", "") != NodeType.KNOWLEDGE_RETRIEVAL.value:
+                continue
+            # Knowledge retrieval nodes carry dataset ids; export them encrypted per tenant.
+            node_data = node["data"]
+            node_data["dataset_ids"] = [
+                self.encrypt_dataset_id(dataset_id=plain_id, tenant_id=pipeline.tenant_id)
+                for plain_id in node_data.get("dataset_ids", [])
+            ]
+        export_data["workflow"] = workflow_dict
+
+        generated = DependenciesAnalysisService.generate_dependencies(
+            tenant_id=pipeline.tenant_id,
+            dependencies=self._extract_dependencies_from_workflow(workflow),
+        )
+        export_data["dependencies"] = [jsonable_encoder(item.model_dump()) for item in generated]
+
+    def _extract_dependencies_from_workflow(self, workflow: Workflow) -> list[str]:
+        """
+        Collect the dependencies referenced by a workflow's graph.
+
+        :param workflow: Workflow instance
+        :return: dependencies list format like ["langgenius/google"]
+        """
+        return self._extract_dependencies_from_workflow_graph(workflow.graph_dict)
+
+    def _extract_dependencies_from_workflow_graph(self, graph: Mapping) -> list[str]:
+        """
+        Walk the graph nodes and collect the dependency each node type refers to.
+
+        A node that fails to parse or analyse is logged and skipped; the remaining
+        nodes are still processed.
+
+        :param graph: Workflow graph
+        :return: dependencies list format like ["langgenius/google"]
+        """
+        analyze_provider = DependenciesAnalysisService.analyze_model_provider_dependency
+        dependencies = []
+        for node in graph.get("nodes", []):
+            try:
+                match node.get("data", {}).get("type"):
+                    case NodeType.TOOL.value:
+                        tool_data = ToolNodeData(**node["data"])
+                        dependencies.append(DependenciesAnalysisService.analyze_tool_dependency(tool_data.provider_id))
+                    case NodeType.DATASOURCE.value:
+                        datasource_data = DatasourceNodeData(**node["data"])
+                        # local_file datasources contribute no dependency entry
+                        if datasource_data.provider_type != "local_file":
+                            dependencies.append(datasource_data.plugin_id)
+                    case NodeType.LLM.value:
+                        llm_data = LLMNodeData(**node["data"])
+                        dependencies.append(analyze_provider(llm_data.model.provider))
+                    case NodeType.QUESTION_CLASSIFIER.value:
+                        classifier_data = QuestionClassifierNodeData(**node["data"])
+                        dependencies.append(analyze_provider(classifier_data.model.provider))
+                    case NodeType.PARAMETER_EXTRACTOR.value:
+                        extractor_data = ParameterExtractorNodeData(**node["data"])
+                        dependencies.append(analyze_provider(extractor_data.model.provider))
+                    case NodeType.KNOWLEDGE_INDEX.value:
+                        index_config = KnowledgeConfiguration(**node["data"])
+                        # embedding provider, only for high quality indexing
+                        if index_config.indexing_technique == "high_quality" and index_config.embedding_model_provider:
+                            dependencies.append(analyze_provider(index_config.embedding_model_provider))
+                        # reranking provider, only when reranking by model is enabled
+                        retrieval = index_config.retrieval_model
+                        if (
+                            retrieval.reranking_mode == "reranking_model"
+                            and retrieval.reranking_enable
+                            and retrieval.reranking_model
+                            and retrieval.reranking_model.reranking_provider_name
+                        ):
+                            dependencies.append(analyze_provider(retrieval.reranking_model.reranking_provider_name))
+                    case NodeType.KNOWLEDGE_RETRIEVAL.value:
+                        retrieval_data = KnowledgeRetrievalNodeData(**node["data"])
+                        if retrieval_data.retrieval_mode == "multiple":
+                            multiple_config = retrieval_data.multiple_retrieval_config
+                            if not multiple_config:
+                                pass
+                            elif multiple_config.reranking_mode == "reranking_model":
+                                if multiple_config.reranking_model:
+                                    dependencies.append(analyze_provider(multiple_config.reranking_model.provider))
+                            elif multiple_config.reranking_mode == "weighted_score":
+                                if multiple_config.weights:
+                                    vector_setting = multiple_config.weights.vector_setting
+                                    dependencies.append(analyze_provider(vector_setting.embedding_provider_name))
+                        elif retrieval_data.retrieval_mode == "single":
+                            single_config = retrieval_data.single_retrieval_config
+                            if single_config:
+                                dependencies.append(analyze_provider(single_config.model.provider))
+                    case _:
+                        # TODO: Handle default case or unknown node types
+                        pass
+            except Exception as e:
+                logger.exception("Error extracting node dependency", exc_info=e)
+
+        return dependencies
+
+    @classmethod
+    def _extract_dependencies_from_model_config(cls, model_config: Mapping) -> list[str]:
+        """
+        Extract dependencies from model config
+
+        Covers the completion model, the reranking model of each configured dataset
+        and the agent tools. Any error is logged and the dependencies gathered so far
+        are returned.
+
+        :param model_config: model config dict
+        :return: dependencies list format like ["langgenius/google"]
+        """
+        dependencies = []
+
+        try:
+            # completion model
+            model_dict = model_config.get("model", {})
+            if model_dict:
+                dependencies.append(
+                    DependenciesAnalysisService.analyze_model_provider_dependency(model_dict.get("provider", ""))
+                )
+
+            # reranking model
+            dataset_configs = model_config.get("dataset_configs", {})
+            if dataset_configs:
+                for dataset_config in dataset_configs.get("datasets", {}).get("datasets", []):
+                    reranking_model = dataset_config.get("reranking_model")
+                    if not reranking_model:
+                        continue
+                    # The provider name may be a plain string or a mapping holding a
+                    # "provider" key; calling .get() on the string form would raise and
+                    # abort the rest of the extraction (including the tools below).
+                    provider = reranking_model.get("reranking_provider_name")
+                    if isinstance(provider, Mapping):
+                        provider = provider.get("provider")
+                    if provider:
+                        dependencies.append(DependenciesAnalysisService.analyze_model_provider_dependency(provider))
+
+            # tools
+            agent_configs = model_config.get("agent_mode", {})
+            if agent_configs:
+                for agent_config in agent_configs.get("tools", []):
+                    dependencies.append(
+                        DependenciesAnalysisService.analyze_tool_dependency(agent_config.get("provider_id"))
+                    )
+
+        except Exception as e:
+            logger.exception("Error extracting model config dependency", exc_info=e)
+
+        return dependencies
+
+    @classmethod
+    def get_leaked_dependencies(cls, tenant_id: str, dsl_dependencies: list[dict]) -> list[PluginDependency]:
+        """
+        Parse the DSL dependency dicts and return those reported as leaked for the workspace.
+
+        An empty dependency list short-circuits to an empty result without calling
+        the analysis service.
+        """
+        parsed = [PluginDependency(**raw) for raw in dsl_dependencies]
+        if parsed:
+            return DependenciesAnalysisService.get_leaked_dependencies(tenant_id=tenant_id, dependencies=parsed)
+        return []
+
+    def _generate_aes_key(self, tenant_id: str) -> bytes:
+        """Derive the AES key as the SHA-256 digest of the tenant id."""
+        digest = hashlib.sha256()
+        digest.update(tenant_id.encode())
+        return digest.digest()
+
+    def encrypt_dataset_id(self, dataset_id: str, tenant_id: str) -> str:
+        """
+        Encrypt a dataset id with AES-CBC and return it base64 encoded.
+
+        The IV is the first 16 bytes of the tenant key, so the same dataset id and
+        tenant always produce the same ciphertext.
+        """
+        key = self._generate_aes_key(tenant_id)
+        cipher = AES.new(key, AES.MODE_CBC, key[:16])
+        padded_id = pad(dataset_id.encode(), AES.block_size)
+        return base64.b64encode(cipher.encrypt(padded_id)).decode()
+
+    def decrypt_dataset_id(self, encrypted_data: str, tenant_id: str) -> str | None:
+        """
+        Decrypt a base64 encoded AES-CBC dataset id.
+
+        :return: the decrypted id, or None when decoding, decryption or unpadding fails
+        """
+        try:
+            key = self._generate_aes_key(tenant_id)
+            cipher = AES.new(key, AES.MODE_CBC, key[:16])
+            ciphertext = base64.b64decode(encrypted_data)
+            return unpad(cipher.decrypt(ciphertext), AES.block_size).decode()
+        except Exception:
+            # Best effort: any failure is reported to the caller as None.
+            return None
+
+    def create_rag_pipeline_dataset(
+        self,
+        tenant_id: str,
+        rag_pipeline_dataset_create_entity: RagPipelineDatasetCreateEntity,
+    ):
+        """
+        Import the entity's YAML content as a pipeline with a new dataset.
+
+        When the entity has no name, an incremental "Untitled" name is generated
+        from the tenant's existing dataset names and written back to the entity.
+
+        :return: dict describing the import (ids, status, DSL versions, error)
+        :raises ValueError: if the requested dataset name already exists for the tenant
+        """
+        entity = rag_pipeline_dataset_create_entity
+        if entity.name:
+            # reject a name that is already taken within the tenant
+            duplicate = self._session.query(Dataset).filter_by(name=entity.name, tenant_id=tenant_id).first()
+            if duplicate:
+                raise ValueError(f"Dataset with name {entity.name} already exists.")
+        else:
+            # generate an incremental name as Untitled 1 2 3 ...
+            tenant_datasets = self._session.query(Dataset).filter_by(tenant_id=tenant_id).all()
+            entity.name = generate_incremental_name(
+                [existing.name for existing in tenant_datasets],
+                "Untitled",
+            )
+
+        import_info: RagPipelineImportInfo = self.import_rag_pipeline(
+            account=cast(Account, current_user),
+            import_mode=ImportMode.YAML_CONTENT.value,
+            yaml_content=entity.yaml_content,
+            dataset=None,
+            dataset_name=entity.name,
+            icon_info=entity.icon_info,
+        )
+        return {
+            "id": import_info.id,
+            "dataset_id": import_info.dataset_id,
+            "pipeline_id": import_info.pipeline_id,
+            "status": import_info.status,
+            "imported_dsl_version": import_info.imported_dsl_version,
+            "current_dsl_version": import_info.current_dsl_version,
+            "error": import_info.error,
+        }
--- a/api/services/rag_pipeline/rag_pipeline_manage_service.py
+++ b/api/services/rag_pipeline/rag_pipeline_manage_service.py
@ -0,0 +1,23 @@
+from core.plugin.entities.plugin_daemon import PluginDatasourceProviderEntity
+from core.plugin.impl.datasource import PluginDatasourceManager
+from services.datasource_provider_service import DatasourceProviderService
+
+
+class RagPipelineManageService:
+    """Read-only helpers for listing what a tenant can use in RAG pipelines."""
+
+    @staticmethod
+    def list_rag_pipeline_datasources(tenant_id: str) -> list[PluginDatasourceProviderEntity]:
+        """
+        List the tenant's datasource providers and flag the ones that have credentials.
+
+        :param tenant_id: tenant whose providers and credentials are looked up
+        :return: the fetched providers; `is_authorized` is set to True on each provider
+            for which credentials were found
+        """
+        manager = PluginDatasourceManager()
+        datasources = manager.fetch_datasource_providers(tenant_id)
+
+        # The credential service does not depend on the provider being inspected,
+        # so build it once instead of once per loop iteration.
+        datasource_provider_service = DatasourceProviderService()
+        for datasource in datasources:
+            credentials = datasource_provider_service.get_datasource_credentials(
+                tenant_id=tenant_id, provider=datasource.provider, plugin_id=datasource.plugin_id
+            )
+            if credentials:
+                datasource.is_authorized = True
+        return datasources
--- a/api/services/rag_pipeline/rag_pipeline_transform_service.py
+++ b/api/services/rag_pipeline/rag_pipeline_transform_service.py
@ -0,0 +1,383 @@
+import json
+from datetime import UTC, datetime
+from pathlib import Path
+from uuid import uuid4
+
+import yaml
+from flask_login import current_user
+
+from constants import DOCUMENT_EXTENSIONS
+from core.plugin.impl.plugin import PluginInstaller
+from extensions.ext_database import db
+from factories import variable_factory
+from models.dataset import Dataset, Document, DocumentPipelineExecutionLog, Pipeline
+from models.model import UploadFile
+from models.workflow import Workflow, WorkflowType
+from services.entities.knowledge_entities.rag_pipeline_entities import KnowledgeConfiguration, RetrievalSetting
+from services.plugin.plugin_migration import PluginMigration
+from services.plugin.plugin_service import PluginService
+
+
+class RagPipelineTransformService:
+    def transform_dataset(self, dataset_id: str):
+        """
+        Convert a dataset to the rag_pipeline runtime mode using a bundled pipeline template.
+
+        :param dataset_id: id of the dataset to convert
+        :return: dict with "pipeline_id", "dataset_id" and "status" (or whatever the
+            empty-pipeline helper returns when the dataset has no usable settings)
+        :raises ValueError: if the dataset is missing, is not a "vendor" dataset, or the
+            template lacks workflow data / the doc form is unsupported
+        """
+        dataset = db.session.query(Dataset).where(Dataset.id == dataset_id).first()
+        if not dataset:
+            raise ValueError("Dataset not found")
+
+        already_converted = dataset.pipeline_id and dataset.runtime_mode == "rag_pipeline"
+        if already_converted:
+            return {
+                "pipeline_id": dataset.pipeline_id,
+                "dataset_id": dataset_id,
+                "status": "success",
+            }
+        if dataset.provider != "vendor":
+            raise ValueError("External dataset is not supported")
+
+        datasource_type = dataset.data_source_type
+        indexing_technique = dataset.indexing_technique
+        doc_form = dataset.doc_form
+        # Nothing to derive a template from: fall back to an empty pipeline.
+        if (not datasource_type and not indexing_technique) or not doc_form:
+            return self._transfrom_to_empty_pipeline(dataset)
+
+        retrieval_model = dataset.retrieval_model
+        pipeline_yaml = self._get_transform_yaml(doc_form, datasource_type, indexing_technique)
+        # deal dependencies
+        self._deal_dependencies(pipeline_yaml, dataset.tenant_id)
+
+        workflow_data = pipeline_yaml.get("workflow")
+        if not workflow_data:
+            raise ValueError("Missing workflow data for rag pipeline")
+        graph = workflow_data.get("graph", {})
+
+        # Adapt the template nodes to this dataset's settings.
+        adapted_nodes = []
+        for node in graph.get("nodes", []):
+            node_data = node.get("data", {})
+            if node_data.get("type") == "datasource" and node_data.get("provider_type") == "local_file":
+                node = self._deal_file_extensions(node)
+            if node.get("data", {}).get("type") == "knowledge-index":
+                node = self._deal_knowledge_index(dataset, doc_form, indexing_technique, retrieval_model, node)
+            adapted_nodes.append(node)
+        if adapted_nodes:
+            graph["nodes"] = adapted_nodes
+            workflow_data["graph"] = graph
+            pipeline_yaml["workflow"] = workflow_data
+
+        # create pipeline
+        pipeline = self._create_pipeline(pipeline_yaml)
+
+        # save chunk structure to dataset
+        if doc_form == "hierarchical_model":
+            dataset.chunk_structure = "hierarchical_model"
+        elif doc_form == "text_model":
+            dataset.chunk_structure = "text_model"
+        else:
+            raise ValueError("Unsupported doc form")
+
+        dataset.runtime_mode = "rag_pipeline"
+        dataset.pipeline_id = pipeline.id
+
+        # deal document data
+        self._deal_document_data(dataset)
+
+        db.session.commit()
+        return {
+            "pipeline_id": pipeline.id,
+            "dataset_id": dataset_id,
+            "status": "success",
+        }
+
+    def _get_transform_yaml(self, doc_form: str, datasource_type: str, indexing_technique: str | None):
+        """
+        Load the bundled pipeline template matching the dataset's settings.
+
+        Templates live in the "transform" directory next to this module and are named
+        "<datasource>-general-<technique>.yml" for "text_model" and
+        "<datasource>-parentchild.yml" for "hierarchical_model".
+
+        :param doc_form: "text_model" or "hierarchical_model"
+        :param datasource_type: "upload_file", "notion_import" or "website_crawl"
+        :param indexing_technique: "high_quality" or "economy"; only used for "text_model"
+        :return: the parsed template, or an empty dict when a "text_model" dataset has
+            an indexing technique without a template
+        :raises ValueError: on an unsupported doc form or datasource type
+        """
+        datasource_prefixes = {
+            "upload_file": "file",
+            "notion_import": "notion",
+            "website_crawl": "website-crawl",
+        }
+        technique_suffixes = {
+            "high_quality": "general-high-quality",
+            "economy": "general-economy",
+        }
+
+        if doc_form not in ("text_model", "hierarchical_model"):
+            raise ValueError("Unsupported doc form")
+        prefix = datasource_prefixes.get(datasource_type)
+        if prefix is None:
+            raise ValueError("Unsupported datasource type")
+
+        if doc_form == "hierarchical_model":
+            template_name = f"{prefix}-parentchild.yml"
+        else:
+            suffix = technique_suffixes.get(indexing_technique)
+            if suffix is None:
+                # No template for this indexing technique.
+                return {}
+            template_name = f"{prefix}-{suffix}.yml"
+
+        # Read as UTF-8 explicitly so parsing does not depend on the process locale.
+        with open(Path(__file__).parent / "transform" / template_name, encoding="utf-8") as f:
+            return yaml.safe_load(f)
+
+    def _deal_file_extensions(self, node: dict):
+        """
+        Replace a node's non-empty "fileExtensions" list with DOCUMENT_EXTENSIONS.
+
+        :param node: graph node; mutated in place when it declares file extensions
+        :return: the same node object
+        """
+        if not node.get("data", {}).get("fileExtensions", []):
+            # Nodes without declared extensions are left untouched.
+            return node
+        # The template's own list is discarded, so there is nothing to normalise first.
+        node["data"]["fileExtensions"] = DOCUMENT_EXTENSIONS
+        return node
+
+    def _deal_knowledge_index(
+        self, dataset: Dataset, doc_form: str, indexing_technique: str | None, retrieval_model: dict, node: dict
+    ):
+        """
+        Merge the dataset's embedding and retrieval settings into a knowledge-index node.
+
+        When the dataset has no retrieval model, the node's own retrieval model is
+        written back to the dataset instead.
+
+        :return: the node, with its "data" updated from the resulting configuration
+        """
+        node_data = node.get("data", {})
+        configuration = KnowledgeConfiguration(**node_data)
+
+        if indexing_technique == "high_quality":
+            configuration.embedding_model = dataset.embedding_model
+            configuration.embedding_model_provider = dataset.embedding_model_provider
+
+        if not retrieval_model:
+            dataset.retrieval_model = configuration.retrieval_model.model_dump()
+        else:
+            retrieval_setting = RetrievalSetting(**retrieval_model)
+            if indexing_technique == "economy":
+                # economy indexing always searches by keyword
+                retrieval_setting.search_method = "keyword_search"
+            configuration.retrieval_model = retrieval_setting
+
+        node_data.update(configuration.model_dump())
+        node["data"] = node_data
+        return node
+
+    def _create_pipeline(
+        self,
+        data: dict,
+    ) -> Pipeline:
+        """
+        Create a published, public pipeline plus its draft and published workflows.
+
+        Objects are added to the session and flushed, not committed.
+
+        :param data: parsed template; reads the "rag_pipeline" and "workflow" sections
+        :return: the new Pipeline, linked to the published workflow
+        :raises ValueError: if the template has no workflow mapping
+        """
+        pipeline_data = data.get("rag_pipeline", {})
+        workflow_data = data.get("workflow")
+        if not workflow_data or not isinstance(workflow_data, dict):
+            raise ValueError("Missing workflow data for rag pipeline")
+
+        environment_variables = [
+            variable_factory.build_environment_variable_from_mapping(mapping)
+            for mapping in workflow_data.get("environment_variables", [])
+        ]
+        conversation_variables = [
+            variable_factory.build_conversation_variable_from_mapping(mapping)
+            for mapping in workflow_data.get("conversation_variables", [])
+        ]
+        rag_pipeline_variables_list = workflow_data.get("rag_pipeline_variables", [])
+        graph_json = json.dumps(workflow_data.get("graph", {}))
+
+        # Create new pipeline owned by the current user's tenant
+        pipeline = Pipeline()
+        pipeline.id = str(uuid4())
+        pipeline.tenant_id = current_user.current_tenant_id
+        pipeline.name = pipeline_data.get("name", "")
+        pipeline.description = pipeline_data.get("description", "")
+        pipeline.created_by = current_user.id
+        pipeline.updated_by = current_user.id
+        pipeline.is_published = True
+        pipeline.is_public = True
+
+        db.session.add(pipeline)
+        db.session.flush()
+
+        def _new_workflow(version: str) -> Workflow:
+            # Both workflows share everything but the version.
+            return Workflow(
+                tenant_id=pipeline.tenant_id,
+                app_id=pipeline.id,
+                features="{}",
+                type=WorkflowType.RAG_PIPELINE.value,
+                version=version,
+                graph=graph_json,
+                created_by=current_user.id,
+                environment_variables=environment_variables,
+                conversation_variables=conversation_variables,
+                rag_pipeline_variables=rag_pipeline_variables_list,
+            )
+
+        draft_workflow = _new_workflow("draft")
+        # The published version is labelled with the current naive UTC timestamp.
+        published_workflow = _new_workflow(str(datetime.now(UTC).replace(tzinfo=None)))
+        db.session.add(draft_workflow)
+        db.session.add(published_workflow)
+        # flush so the published workflow id is available for linking
+        db.session.flush()
+        pipeline.workflow_id = published_workflow.id
+        db.session.add(pipeline)
+        return pipeline
+
+    def _deal_dependencies(self, pipeline_yaml: dict, tenant_id: str):
+        installer_manager = PluginInstaller()
+        installed_plugins = installer_manager.list_plugins(tenant_id)
+
+        plugin_migration = PluginMigration()
+
+        installed_plugins_ids = [plugin.plugin_id for plugin in installed_plugins]
+        dependencies = pipeline_yaml.get("dependencies", [])
+        need_install_plugin_unique_identifiers = []
+        for dependency in dependencies:
+            if dependency.get("type") == "marketplace":
+                plugin_unique_identifier = dependency.get("value", {}).get("plugin_unique_identifier")
+                plugin_id = plugin_unique_identifier.split(":")[0]
+                if plugin_id not in installed_plugins_ids:
+                    plugin_unique_identifier = plugin_migration._fetch_plugin_unique_identifier(plugin_id)  # type: ignore
+                    if plugin_unique_identifier:
+                        need_install_plugin_unique_identifiers.append(plugin_unique_identifier)
+        if need_install_plugin_unique_identifiers:
+            print(need_install_plugin_unique_identifiers)
+            PluginService.install_from_marketplace_pkg(tenant_id, need_install_plugin_unique_identifiers)
+
+    def _transfrom_to_empty_pipeline(self, dataset: Dataset):
+        pipeline = Pipeline(
+            tenant_id=dataset.tenant_id,
+            name=dataset.name,
+            description=dataset.description,
+            created_by=current_user.id,
+        )
+        db.session.add(pipeline)
+        db.session.flush()
+
+        dataset.pipeline_id = pipeline.id
+        dataset.runtime_mode = "rag_pipeline"
+        dataset.updated_by = current_user.id
+        dataset.updated_at = datetime.now(UTC).replace(tzinfo=None)
+        db.session.add(dataset)
+        db.session.commit()
+        return {
+            "pipeline_id": pipeline.id,
+            "dataset_id": dataset.id,
+            "status": "success",
+        }
+
+    def _deal_document_data(self, dataset: Dataset):
+        file_node_id = "1752479895761"
+        notion_node_id = "1752489759475"
+        jina_node_id = "1752491761974"
+        firecrawl_node_id = "1752565402678"
+
+        documents = db.session.query(Document).where(Document.dataset_id == dataset.id).all()
+
+        for document in documents:
+            data_source_info_dict = document.data_source_info_dict
+            if not data_source_info_dict:
+                continue
+            if document.data_source_type == "upload_file":
+                document.data_source_type = "local_file"
+                file_id = data_source_info_dict.get("upload_file_id")
+                if file_id:
+                    file = db.session.query(UploadFile).where(UploadFile.id == file_id).first()
+                    if file:
+                        data_source_info = json.dumps(
+                            {
+                                "real_file_id": file_id,
+                                "name": file.name,
+                                "size": file.size,
+                                "extension": file.extension,
+                                "mime_type": file.mime_type,
+                                "url": "",
+                                "transfer_method": "local_file",
+                            }
+                        )
+                        document.data_source_info = data_source_info
+                        document_pipeline_execution_log = DocumentPipelineExecutionLog(
+                            document_id=document.id,
+                            pipeline_id=dataset.pipeline_id,
+                            datasource_type="local_file",
+                            datasource_info=data_source_info,
+                            input_data={},
+                            created_by=document.created_by,
+                            created_at=document.created_at,
+                            datasource_node_id=file_node_id,
+                        )
+                        db.session.add(document)
+                        db.session.add(document_pipeline_execution_log)
+            elif document.data_source_type == "notion_import":
+                document.data_source_type = "online_document"
+                data_source_info = json.dumps(
+                    {
+                        "workspace_id": data_source_info_dict.get("notion_workspace_id"),
+                        "page": {
+                            "page_id": data_source_info_dict.get("notion_page_id"),
+                            "page_name": document.name,
+                            "page_icon": data_source_info_dict.get("notion_page_icon"),
+                            "type": data_source_info_dict.get("type"),
+                            "last_edited_time": data_source_info_dict.get("last_edited_time"),
+                            "parent_id": None,
+                        },
+                    }
+                )
+                document.data_source_info = data_source_info
+                document_pipeline_execution_log = DocumentPipelineExecutionLog(
+                    document_id=document.id,
+                    pipeline_id=dataset.pipeline_id,
+                    datasource_type="online_document",
+                    datasource_info=data_source_info,
+                    input_data={},
+                    created_by=document.created_by,
+                    created_at=document.created_at,
+                    datasource_node_id=notion_node_id,
+                )
+                db.session.add(document)
+                db.session.add(document_pipeline_execution_log)
+            elif document.data_source_type == "website_crawl":
+                document.data_source_type = "website_crawl"
+                data_source_info = json.dumps(
+                    {
+                        "source_url": data_source_info_dict.get("url"),
+                        "content": "",
+                        "title": document.name,
+                        "description": "",
+                    }
+                )
+                document.data_source_info = data_source_info
+                if data_source_info_dict.get("provider") == "firecrawl":
+                    datasource_node_id = firecrawl_node_id
+                elif data_source_info_dict.get("provider") == "jinareader":
+                    datasource_node_id = jina_node_id
+                else:
+                    continue
+                document_pipeline_execution_log = DocumentPipelineExecutionLog(
+                    document_id=document.id,
+                    pipeline_id=dataset.pipeline_id,
+                    datasource_type="website_crawl",
+                    datasource_info=data_source_info,
+                    input_data={},
+                    created_by=document.created_by,
+                    created_at=document.created_at,
+                    datasource_node_id=datasource_node_id,
+                )
+                db.session.add(document)
+                db.session.add(document_pipeline_execution_log)
--- a/api/services/rag_pipeline/transform/file-general-economy.yml
+++ b/api/services/rag_pipeline/transform/file-general-economy.yml
@ -0,0 +1,709 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: file-general-economy
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: if-else
+      id: 1752479895761-source-1752481129417-target
+      source: '1752479895761'
+      sourceHandle: source
+      target: '1752481129417'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: tool
+      id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
+      source: '1752481129417'
+      sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+      target: '1752480460682'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: document-extractor
+      id: 1752481129417-false-1752481112180-target
+      source: '1752481129417'
+      sourceHandle: 'false'
+      target: '1752481112180'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: variable-aggregator
+      id: 1752480460682-source-1752482022496-target
+      source: '1752480460682'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: document-extractor
+        targetType: variable-aggregator
+      id: 1752481112180-source-1752482022496-target
+      source: '1752481112180'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752482022496-source-1752482151668-target
+      source: '1752482022496'
+      sourceHandle: source
+      target: '1752482151668'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752482151668-source-1752477924228-target
+      source: '1752482151668'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752482151668'
+        - result
+        indexing_technique: economy
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: keyword_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: true
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1076.4656678451215
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1076.4656678451215
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: File
+        datasource_name: upload-file
+        datasource_parameters: {}
+        fileExtensions:
+        - txt
+        - markdown
+        - mdx
+        - pdf
+        - html
+        - xlsx
+        - xls
+        - vtt
+        - properties
+        - doc
+        - docx
+        - csv
+        - eml
+        - msg
+        - pptx
+        - xml
+        - epub
+        - ppt
+        - md
+        plugin_id: langgenius/file
+        provider_name: file
+        provider_type: local_file
+        selected: false
+        title: File
+        type: datasource
+      height: 52
+      id: '1752479895761'
+      position:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            documents:
+              description: the documents extracted from the file
+              items:
+                type: object
+              type: array
+            images:
+              description: The images extracted from the file
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: the file to be parsed(support pdf, ppt, pptx, doc, docx, png, jpg,
+              jpeg)
+            ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
+            pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
+              jpg, jpeg)
+            zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
+          label:
+            en_US: file
+            ja_JP: ファイル
+            pt_BR: arquivo
+            zh_Hans: file
+          llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
+            png, jpg, jpeg)
+          max: null
+          min: null
+          name: file
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: file
+        params:
+          file: ''
+        provider_id: langgenius/dify_extractor/dify_extractor
+        provider_name: langgenius/dify_extractor/dify_extractor
+        provider_type: builtin
+        selected: false
+        title: Dify Extractor
+        tool_configurations: {}
+        tool_description: Dify Extractor
+        tool_label: Dify Extractor
+        tool_name: dify_extractor
+        tool_parameters:
+          file:
+            type: variable
+            value:
+            - '1752479895761'
+            - file
+        type: tool
+      height: 52
+      id: '1752480460682'
+      position:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_array_file: false
+        selected: false
+        title: 文档提取器
+        type: document-extractor
+        variable_selector:
+        - '1752479895761'
+        - file
+      height: 90
+      id: '1752481112180'
+      position:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        cases:
+        - case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          conditions:
+          - comparison_operator: is
+            id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
+            value: .xlsx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
+            value: .xls
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
+            value: .md
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
+            value: .markdown
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
+            value: .mdx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
+            value: .html
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
+            value: .htm
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
+            value: .docx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
+            value: .csv
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
+            value: .txt
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          logical_operator: or
+        selected: false
+        title: IF/ELSE
+        type: if-else
+      height: 358
+      id: '1752481129417'
+      position:
+        x: -489.57009543377865
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -489.57009543377865
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        advanced_settings:
+          group_enabled: false
+          groups:
+          - groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
+            group_name: Group1
+            output_type: string
+            variables:
+            - - '1752481112180'
+              - text
+            - - '1752480460682'
+              - text
+        output_type: string
+        selected: false
+        title: Variable Aggregator
+        type: variable-aggregator
+        variables:
+        - - '1752481112180'
+          - text
+        - - '1752480460682'
+          - text
+      height: 129
+      id: '1752482022496'
+      position:
+        x: 319.441649575055
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 319.441649575055
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: 入力変数
+            pt_BR: Variável de entrada
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: チャンクの区切り記号。
+            pt_BR: O delimitador dos blocos.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: 区切り記号
+            pt_BR: Delimitador
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: 最大長のチャンク。
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: チャンク最大長
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: チャンクの重複長
+            pt_BR: O comprimento de sobreposição dos fragmentos
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: チャンク重複長
+            pt_BR: Comprimento de sobreposição do bloco
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Substituir espaços consecutivos, novas linhas e tabulações
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Excluir todos os URLs e endereços de e-mail
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Excluir todos os URLs e endereços de e-mail
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunker/general_chunker
+        provider_name: langgenius/general_chunker/general_chunker
+        provider_type: builtin
+        selected: false
+        title: General Chunker
+        tool_configurations: {}
+        tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
+        tool_label: General Chunker
+        tool_name: general_chunker
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752482022496.output#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752482151668'
+      position:
+        x: 693.5300771507484
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 693.5300771507484
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: 701.4999626224237
+      y: 128.33739021504016
+      zoom: 0.48941689643726966
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/file-general-high-quality.yml
+++ b/api/services/rag_pipeline/transform/file-general-high-quality.yml
@ -0,0 +1,709 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: '#FFF4ED'
+  icon_type: emoji
+  name: file-general-high-quality
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: if-else
+      id: 1752479895761-source-1752481129417-target
+      source: '1752479895761'
+      sourceHandle: source
+      target: '1752481129417'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: tool
+      id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
+      source: '1752481129417'
+      sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+      target: '1752480460682'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: document-extractor
+      id: 1752481129417-false-1752481112180-target
+      source: '1752481129417'
+      sourceHandle: 'false'
+      target: '1752481112180'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: variable-aggregator
+      id: 1752480460682-source-1752482022496-target
+      source: '1752480460682'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: document-extractor
+        targetType: variable-aggregator
+      id: 1752481112180-source-1752482022496-target
+      source: '1752481112180'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752482022496-source-1752482151668-target
+      source: '1752482022496'
+      sourceHandle: source
+      target: '1752482151668'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752482151668-source-1752477924228-target
+      source: '1752482151668'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752482151668'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1076.4656678451215
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1076.4656678451215
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: File
+        datasource_name: upload-file
+        datasource_parameters: {}
+        fileExtensions:
+        - txt
+        - markdown
+        - mdx
+        - pdf
+        - html
+        - xlsx
+        - xls
+        - vtt
+        - properties
+        - doc
+        - docx
+        - csv
+        - eml
+        - msg
+        - pptx
+        - xml
+        - epub
+        - ppt
+        - md
+        plugin_id: langgenius/file
+        provider_name: file
+        provider_type: local_file
+        selected: false
+        title: File
+        type: datasource
+      height: 52
+      id: '1752479895761'
+      position:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            documents:
+              description: the documents extracted from the file
+              items:
+                type: object
+              type: array
+            images:
+              description: The images extracted from the file
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: the file to be parsed (support pdf, ppt, pptx, doc, docx, png, jpg,
+              jpeg)
+            ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
+            pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
+              jpg, jpeg)
+            zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
+          label:
+            en_US: file
+            ja_JP: ファイル
+            pt_BR: arquivo
+            zh_Hans: file
+          llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
+            png, jpg, jpeg)
+          max: null
+          min: null
+          name: file
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: file
+        params:
+          file: ''
+        provider_id: langgenius/dify_extractor/dify_extractor
+        provider_name: langgenius/dify_extractor/dify_extractor
+        provider_type: builtin
+        selected: false
+        title: Dify Extractor
+        tool_configurations: {}
+        tool_description: Dify Extractor
+        tool_label: Dify Extractor
+        tool_name: dify_extractor
+        tool_parameters:
+          file:
+            type: variable
+            value:
+            - '1752479895761'
+            - file
+        type: tool
+      height: 52
+      id: '1752480460682'
+      position:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_array_file: false
+        selected: false
+        title: 文档提取器
+        type: document-extractor
+        variable_selector:
+        - '1752479895761'
+        - file
+      height: 90
+      id: '1752481112180'
+      position:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        cases:
+        - case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          conditions:
+          - comparison_operator: is
+            id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
+            value: .xlsx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
+            value: .xls
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
+            value: .md
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
+            value: .markdown
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
+            value: .mdx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
+            value: .html
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
+            value: .htm
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
+            value: .docx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
+            value: .csv
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
+            value: .txt
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          logical_operator: or
+        selected: false
+        title: IF/ELSE
+        type: if-else
+      height: 358
+      id: '1752481129417'
+      position:
+        x: -489.57009543377865
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -489.57009543377865
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        advanced_settings:
+          group_enabled: false
+          groups:
+          - groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
+            group_name: Group1
+            output_type: string
+            variables:
+            - - '1752481112180'
+              - text
+            - - '1752480460682'
+              - text
+        output_type: string
+        selected: false
+        title: Variable Aggregator
+        type: variable-aggregator
+        variables:
+        - - '1752481112180'
+          - text
+        - - '1752480460682'
+          - text
+      height: 129
+      id: '1752482022496'
+      position:
+        x: 319.441649575055
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 319.441649575055
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: 入力変数
+            pt_BR: Variável de entrada
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: チャンクの区切り記号。
+            pt_BR: O delimitador dos pedaços.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: 区切り記号
+            pt_BR: Delimitador
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: 最大長のチャンク。
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: チャンク最大長
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: チャンクの重複長
+            pt_BR: The chunk overlap length.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: チャンク重複長
+            pt_BR: Chunk Overlap Length
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Replace consecutive spaces, newlines and tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Replace Consecutive Spaces, Newlines and Tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Delete all URLs and email addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Delete All URLs and Email Addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunker/general_chunker
+        provider_name: langgenius/general_chunker/general_chunker
+        provider_type: builtin
+        selected: false
+        title: General Chunker
+        tool_configurations: {}
+        tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
+        tool_label: General Chunker
+        tool_name: general_chunker
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752482022496.output#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752482151668'
+      position:
+        x: 693.5300771507484
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 693.5300771507484
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: 701.4999626224237
+      y: 128.33739021504016
+      zoom: 0.48941689643726966
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/file-parentchild.yml
+++ b/api/services/rag_pipeline/transform/file-parentchild.yml
@ -0,0 +1,814 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/dify_extractor:0.0.1@50103421d4e002f059b662d21ad2d7a1cf34869abdbe320299d7e382516ebb1c
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: '#FFF4ED'
+  icon_type: emoji
+  name: file-parentchild
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: if-else
+      id: 1752479895761-source-1752481129417-target
+      source: '1752479895761'
+      sourceHandle: source
+      target: '1752481129417'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: tool
+      id: 1752481129417-24e47cad-f1e2-4f74-9884-3f49d5bb37b7-1752480460682-target
+      source: '1752481129417'
+      sourceHandle: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+      target: '1752480460682'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: if-else
+        targetType: document-extractor
+      id: 1752481129417-false-1752481112180-target
+      source: '1752481129417'
+      sourceHandle: 'false'
+      target: '1752481112180'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: variable-aggregator
+      id: 1752480460682-source-1752482022496-target
+      source: '1752480460682'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: document-extractor
+        targetType: variable-aggregator
+      id: 1752481112180-source-1752482022496-target
+      source: '1752481112180'
+      sourceHandle: source
+      target: '1752482022496'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752482022496-source-1752575473519-target
+      source: '1752482022496'
+      sourceHandle: source
+      target: '1752575473519'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752575473519-source-1752477924228-target
+      source: '1752575473519'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: hierarchical_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752575473519'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 994.3774545394483
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 994.3774545394483
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: File
+        datasource_name: upload-file
+        datasource_parameters: {}
+        fileExtensions:
+        - txt
+        - markdown
+        - mdx
+        - pdf
+        - html
+        - xlsx
+        - xls
+        - vtt
+        - properties
+        - doc
+        - docx
+        - csv
+        - eml
+        - msg
+        - pptx
+        - xml
+        - epub
+        - ppt
+        - md
+        plugin_id: langgenius/file
+        provider_name: file
+        provider_type: local_file
+        selected: false
+        title: File
+        type: datasource
+      height: 52
+      id: '1752479895761'
+      position:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -839.8603427660498
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            documents:
+              description: the documents extracted from the file
+              items:
+                type: object
+              type: array
+            images:
+              description: The images extracted from the file
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: the file to be parsed (support pdf, ppt, pptx, doc, docx, png, jpg,
+              jpeg)
+            ja_JP: 解析するファイル(pdf, ppt, pptx, doc, docx, png, jpg, jpegをサポート)
+            pt_BR: o arquivo a ser analisado (suporta pdf, ppt, pptx, doc, docx, png,
+              jpg, jpeg)
+            zh_Hans: 用于解析的文件(支持 pdf, ppt, pptx, doc, docx, png, jpg, jpeg)
+          label:
+            en_US: file
+            ja_JP: ファイル
+            pt_BR: arquivo
+            zh_Hans: file
+          llm_description: the file to be parsed (support pdf, ppt, pptx, doc, docx,
+            png, jpg, jpeg)
+          max: null
+          min: null
+          name: file
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: file
+        params:
+          file: ''
+        provider_id: langgenius/dify_extractor/dify_extractor
+        provider_name: langgenius/dify_extractor/dify_extractor
+        provider_type: builtin
+        selected: false
+        title: Dify Extractor
+        tool_configurations: {}
+        tool_description: Dify Extractor
+        tool_label: Dify Extractor
+        tool_name: dify_extractor
+        tool_parameters:
+          file:
+            type: variable
+            value:
+            - '1752479895761'
+            - file
+        type: tool
+      height: 52
+      id: '1752480460682'
+      position:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_array_file: false
+        selected: false
+        title: 文档提取器
+        type: document-extractor
+        variable_selector:
+        - '1752479895761'
+        - file
+      height: 90
+      id: '1752481112180'
+      position:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      positionAbsolute:
+        x: -108.28652292656551
+        y: 390.6576481692478
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        cases:
+        - case_id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          conditions:
+          - comparison_operator: is
+            id: 9da88d93-3ff6-463f-abfd-6bcafbf2554d
+            value: .xlsx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: d0e88f5e-dfe3-4bae-af0c-dbec267500de
+            value: .xls
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: a957e91e-1ed7-4c6b-9c80-2f0948858f1d
+            value: .md
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 870c3c39-8d3f-474a-ab8b-9c0ccf53db73
+            value: .markdown
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: f9541513-1e71-4dc1-9db5-35dc84a39e3c
+            value: .mdx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 4c7f455b-ac20-40ca-9495-6cc44ffcb35d
+            value: .html
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 2e12d9c7-8057-4a09-8851-f9fd1d0718d1
+            value: .htm
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 73a995a9-d8b9-4aef-89f7-306e2ddcbce2
+            value: .docx
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: 8a2e8772-0426-458b-a1f9-9eaaec0f27c8
+            value: .csv
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          - comparison_operator: is
+            id: aa2cb6b6-a2fc-462a-a9f5-c9c3f33a1602
+            value: .txt
+            varType: file
+            variable_selector:
+            - '1752479895761'
+            - file
+            - extension
+          id: 24e47cad-f1e2-4f74-9884-3f49d5bb37b7
+          logical_operator: or
+        selected: false
+        title: IF/ELSE
+        type: if-else
+      height: 358
+      id: '1752481129417'
+      position:
+        x: -512.2335487893622
+        y: 251.3910724383104
+      positionAbsolute:
+        x: -512.2335487893622
+        y: 251.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        advanced_settings:
+          group_enabled: false
+          groups:
+          - groupId: f4cf07b4-914d-4544-8ef8-0c5d9e4f21a7
+            group_name: Group1
+            output_type: string
+            variables:
+            - - '1752481112180'
+              - text
+            - - '1752480460682'
+              - text
+        output_type: string
+        selected: false
+        title: Variable Aggregator
+        type: variable-aggregator
+        variables:
+        - - '1752481112180'
+          - text
+        - - '1752480460682'
+          - text
+      height: 129
+      id: '1752482022496'
+      position:
+        x: 319.441649575055
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 319.441649575055
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: Parent child chunks result
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input text
+            ja_JP: 入力テキスト
+            pt_BR: Texto de entrada
+            zh_Hans: 输入文本
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_text
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 1024
+          form: llm
+          human_description:
+            en_US: Maximum length for chunking
+            ja_JP: チャンク分割の最大長
+            pt_BR: Comprimento máximo para divisão
+            zh_Hans: 用于分块的最大长度
+          label:
+            en_US: Maximum Length
+            ja_JP: 最大長
+            pt_BR: Comprimento Máximo
+            zh_Hans: 最大长度
+          llm_description: Maximum length allowed per chunk
+          max: null
+          min: null
+          name: max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '
+
+
+            '
+          form: llm
+          human_description:
+            en_US: Separator used for chunking
+            ja_JP: チャンク分割に使用する区切り文字
+            pt_BR: Separador usado para divisão
+            zh_Hans: 用于分块的分隔符
+          label:
+            en_US: Chunk Separator
+            ja_JP: チャンク区切り文字
+            pt_BR: Separador de Divisão
+            zh_Hans: 分块分隔符
+          llm_description: The separator used to split chunks
+          max: null
+          min: null
+          name: separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 512
+          form: llm
+          human_description:
+            en_US: Maximum length for subchunking
+            ja_JP: サブチャンク分割の最大長
+            pt_BR: Comprimento máximo para subdivisão
+            zh_Hans: 用于子分块的最大长度
+          label:
+            en_US: Subchunk Maximum Length
+            ja_JP: サブチャンク最大長
+            pt_BR: Comprimento Máximo de Subdivisão
+            zh_Hans: 子分块最大长度
+          llm_description: Maximum length allowed per subchunk
+          max: null
+          min: null
+          name: subchunk_max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '. '
+          form: llm
+          human_description:
+            en_US: Separator used for subchunking
+            ja_JP: サブチャンク分割に使用する区切り文字
+            pt_BR: Separador usado para subdivisão
+            zh_Hans: 用于子分块的分隔符
+          label:
+            en_US: Subchunk Separator
+            ja_JP: サブチャンク区切り文字
+            pt_BR: Separador de Subdivisão
+            zh_Hans: 子分块分隔符
+          llm_description: The separator used to split subchunks
+          max: null
+          min: null
+          name: subchunk_separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: paragraph
+          form: llm
+          human_description:
+            en_US: Split text into paragraphs based on separator and maximum chunk
+              length, using split text as parent block or entire document as parent
+              block and directly retrieve.
+            ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
+              を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
+            pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
+              máximo do bloco, usando o texto dividido como bloco pai ou documento
+              completo como bloco pai e diretamente recuperá-lo.
+            zh_Hans: 根据分隔符和最大块长度将文本拆分为段落，使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
+          label:
+            en_US: Parent Mode
+            ja_JP: 親モード
+            pt_BR: Modo Pai
+            zh_Hans: 父块模式
+          llm_description: Split text into paragraphs based on separator and maximum
+            chunk length, using split text as parent block or entire document as parent
+            block and directly retrieve.
+          max: null
+          min: null
+          name: parent_mode
+          options:
+          - icon: ''
+            label:
+              en_US: Paragraph
+              ja_JP: 段落
+              pt_BR: Parágrafo
+              zh_Hans: 段落
+            value: paragraph
+          - icon: ''
+            label:
+              en_US: Full Document
+              ja_JP: 全文
+              pt_BR: Documento Completo
+              zh_Hans: 全文
+            value: full_doc
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: select
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove extra spaces in the text
+            ja_JP: テキスト内の余分なスペースを削除するかどうか
+            pt_BR: Se deve remover espaços extras no texto
+            zh_Hans: 是否移除文本中的多余空格
+          label:
+            en_US: Remove Extra Spaces
+            ja_JP: 余分なスペースを削除
+            pt_BR: Remover Espaços Extras
+            zh_Hans: 移除多余空格
+          llm_description: Whether to remove extra spaces in the text
+          max: null
+          min: null
+          name: remove_extra_spaces
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove URLs and emails in the text
+            ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
+            pt_BR: Se deve remover URLs e e-mails no texto
+            zh_Hans: 是否移除文本中的URL和电子邮件地址
+          label:
+            en_US: Remove URLs and Emails
+            ja_JP: URLとメールアドレスを削除
+            pt_BR: Remover URLs e E-mails
+            zh_Hans: 移除URL和电子邮件地址
+          llm_description: Whether to remove URLs and emails in the text
+          max: null
+          min: null
+          name: remove_urls_emails
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          input_text: ''
+          max_length: ''
+          parent_mode: ''
+          remove_extra_spaces: ''
+          remove_urls_emails: ''
+          separator: ''
+          subchunk_max_length: ''
+          subchunk_separator: ''
+        provider_id: langgenius/parentchild_chunker/parentchild_chunker
+        provider_name: langgenius/parentchild_chunker/parentchild_chunker
+        provider_type: builtin
+        selected: false
+        title: Parent-child Chunker
+        tool_configurations: {}
+        tool_description: Parent-child Chunk Structure
+        tool_label: Parent-child Chunker
+        tool_name: parentchild_chunker
+        tool_parameters:
+          input_text:
+            type: mixed
+            value: '{{#1752482022496.output#}}'
+          max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          parent_mode:
+            type: variable
+            value:
+            - rag
+            - shared
+            - parent_mode
+          remove_extra_spaces:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+          remove_urls_emails:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          separator:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          subchunk_max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - child_max_chunk_length
+          subchunk_separator:
+            type: mixed
+            value: '{{#rag.shared.child_delimiter#}}'
+        type: tool
+      height: 52
+      id: '1752575473519'
+      position:
+        x: 637.9241611063885
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 637.9241611063885
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: 948.6766333808323
+      y: -102.06757184183238
+      zoom: 0.8375774577380971
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 256
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n
+    label: Child delimiter
+    max_length: 256
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n is recommended
+      for splitting parent chunks into small child chunks. You can also use special
+      delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: child_delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 512
+    label: Child max chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: child_max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: paragraph
+    label: Parent mode
+    max_length: 48
+    options:
+    - full_doc
+    - paragraph
+    placeholder: null
+    required: true
+    tooltips: null
+    type: select
+    unit: null
+    variable: parent_mode
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/notion-general-economy.yml
+++ b/api/services/rag_pipeline/transform/notion-general-economy.yml
@ -0,0 +1,400 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: notion-general-economy
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752482151668-source-1752477924228-target
+      source: '1752482151668'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: tool
+      id: 1752489759475-source-1752482151668-target
+      source: '1752489759475'
+      sourceHandle: source
+      target: '1752482151668'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752482151668'
+        - result
+        indexing_technique: economy
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: keyword_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: true
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1444.5503479271906
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1444.5503479271906
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: 入力変数
+            pt_BR: Variável de entrada
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: チャンクの区切り記号。
+            pt_BR: O delimitador dos pedaços.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: 区切り記号
+            pt_BR: Delimitador
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: チャンクの最大長。
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: チャンク最大長
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: チャンクの重複長
+            pt_BR: The chunk overlap length.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: チャンク重複長
+            pt_BR: Chunk Overlap Length
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Replace consecutive spaces, newlines and tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Replace Consecutive Spaces, Newlines and Tabs
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Delete all URLs and email addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Delete All URLs and Email Addresses
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunker/general_chunker
+        provider_name: langgenius/general_chunker/general_chunker
+        provider_type: builtin
+        selected: false
+        title: General Chunker
+        tool_configurations: {}
+        tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
+        tool_label: General Chunker
+        tool_name: general_chunker
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752489759475.content#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752482151668'
+      position:
+        x: 1063.6922916384628
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1063.6922916384628
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Notion数据源
+        datasource_name: notion_datasource
+        datasource_parameters: {}
+        plugin_id: langgenius/notion_datasource
+        provider_name: notion_datasource
+        provider_type: online_document
+        selected: false
+        title: Notion数据源
+        type: datasource
+      height: 52
+      id: '1752489759475'
+      position:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -838.569649323166
+      y: -168.94656489167426
+      zoom: 1.286925643857699
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into chunks. You can also use special
+      delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/notion-general-high-quality.yml
+++ b/api/services/rag_pipeline/transform/notion-general-high-quality.yml
@ -0,0 +1,400 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: '#FFF4ED'
+  icon_type: emoji
+  name: notion-general-high-quality
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752482151668-source-1752477924228-target
+      source: '1752482151668'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: tool
+      id: 1752489759475-source-1752482151668-target
+      source: '1752489759475'
+      sourceHandle: source
+      target: '1752482151668'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752482151668'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: true
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1444.5503479271906
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1444.5503479271906
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: 入力変数
+            pt_BR: Variável de entrada
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: チャンクの区切り記号。
+            pt_BR: O delimitador dos pedaços.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: 区切り記号
+            pt_BR: Delimitador
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: チャンクの最大長。
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: チャンク最大長
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: チャンクの重複長
+            pt_BR: The chunk overlap length.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: チャンク重複長
+            pt_BR: Comprimento de sobreposição dos blocos
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Substituir espaços, quebras de linha e tabulações consecutivos
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Substituir Espaços, Quebras de Linha e Tabulações Consecutivos
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Excluir todos os URLs e endereços de e-mail
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Excluir Todos os URLs e Endereços de E-mail
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunker/general_chunker
+        provider_name: langgenius/general_chunker/general_chunker
+        provider_type: builtin
+        selected: false
+        title: General Chunker
+        tool_configurations: {}
+        tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
+        tool_label: General Chunker
+        tool_name: general_chunker
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752489759475.content#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752482151668'
+      position:
+        x: 1063.6922916384628
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1063.6922916384628
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Notion数据源
+        datasource_name: notion_datasource
+        datasource_parameters: {}
+        plugin_id: langgenius/notion_datasource
+        provider_name: notion_datasource
+        provider_type: online_document
+        selected: false
+        title: Notion数据源
+        type: datasource
+      height: 52
+      id: '1752489759475'
+      position:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -838.569649323166
+      y: -168.94656489167426
+      zoom: 1.286925643857699
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into paragraph-sized chunks. You can also
+      use special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/notion-parentchild.yml
+++ b/api/services/rag_pipeline/transform/notion-parentchild.yml
@ -0,0 +1,506 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/notion_datasource:0.0.1@2dd49c2c3ffff976be8d22efb1ac0f63522a8d0f24ef8c44729d0a50a94ec039
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: notion-parentchild
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: tool
+      id: 1752489759475-source-1752490343805-target
+      source: '1752489759475'
+      sourceHandle: source
+      target: '1752490343805'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752490343805-source-1752477924228-target
+      source: '1752490343805'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: hierarchical_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752490343805'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 1486.2052698032674
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1486.2052698032674
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Notion数据源
+        datasource_name: notion_datasource
+        datasource_parameters: {}
+        plugin_id: langgenius/notion_datasource
+        provider_name: notion_datasource
+        provider_type: online_document
+        selected: false
+        title: Notion数据源
+        type: datasource
+      height: 52
+      id: '1752489759475'
+      position:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 736.9082104000458
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: Parent child chunks result
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input text
+            ja_JP: 入力テキスト
+            pt_BR: Texto de entrada
+            zh_Hans: 输入文本
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_text
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 1024
+          form: llm
+          human_description:
+            en_US: Maximum length for chunking
+            ja_JP: チャンク分割の最大長
+            pt_BR: Comprimento máximo para divisão
+            zh_Hans: 用于分块的最大长度
+          label:
+            en_US: Maximum Length
+            ja_JP: 最大長
+            pt_BR: Comprimento Máximo
+            zh_Hans: 最大长度
+          llm_description: Maximum length allowed per chunk
+          max: null
+          min: null
+          name: max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '
+
+
+            '
+          form: llm
+          human_description:
+            en_US: Separator used for chunking
+            ja_JP: チャンク分割に使用する区切り文字
+            pt_BR: Separador usado para divisão
+            zh_Hans: 用于分块的分隔符
+          label:
+            en_US: Chunk Separator
+            ja_JP: チャンク区切り文字
+            pt_BR: Separador de Divisão
+            zh_Hans: 分块分隔符
+          llm_description: The separator used to split chunks
+          max: null
+          min: null
+          name: separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 512
+          form: llm
+          human_description:
+            en_US: Maximum length for subchunking
+            ja_JP: サブチャンク分割の最大長
+            pt_BR: Comprimento máximo para subdivisão
+            zh_Hans: 用于子分块的最大长度
+          label:
+            en_US: Subchunk Maximum Length
+            ja_JP: サブチャンク最大長
+            pt_BR: Comprimento Máximo de Subdivisão
+            zh_Hans: 子分块最大长度
+          llm_description: Maximum length allowed per subchunk
+          max: null
+          min: null
+          name: subchunk_max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '. '
+          form: llm
+          human_description:
+            en_US: Separator used for subchunking
+            ja_JP: サブチャンク分割に使用する区切り文字
+            pt_BR: Separador usado para subdivisão
+            zh_Hans: 用于子分块的分隔符
+          label:
+            en_US: Subchunk Separator
+            ja_JP: サブチャンク区切り文字
+            pt_BR: Separador de Subdivisão
+            zh_Hans: 子分块分隔符
+          llm_description: The separator used to split subchunks
+          max: null
+          min: null
+          name: subchunk_separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: paragraph
+          form: llm
+          human_description:
+            en_US: Split text into paragraphs based on separator and maximum chunk
+              length, using split text as parent block or entire document as parent
+              block and directly retrieve.
+            ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
+              を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
+            pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
+              máximo do bloco, usando o texto dividido como bloco pai ou documento
+              completo como bloco pai e diretamente recuperá-lo.
+            zh_Hans: 根据分隔符和最大块长度将文本拆分为段落，使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
+          label:
+            en_US: Parent Mode
+            ja_JP: 親モード
+            pt_BR: Modo Pai
+            zh_Hans: 父块模式
+          llm_description: Split text into paragraphs based on separator and maximum
+            chunk length, using split text as parent block or entire document as parent
+            block and directly retrieve.
+          max: null
+          min: null
+          name: parent_mode
+          options:
+          - icon: ''
+            label:
+              en_US: Paragraph
+              ja_JP: 段落
+              pt_BR: Parágrafo
+              zh_Hans: 段落
+            value: paragraph
+          - icon: ''
+            label:
+              en_US: Full Document
+              ja_JP: 全文
+              pt_BR: Documento Completo
+              zh_Hans: 全文
+            value: full_doc
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: select
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove extra spaces in the text
+            ja_JP: テキスト内の余分なスペースを削除するかどうか
+            pt_BR: Se deve remover espaços extras no texto
+            zh_Hans: 是否移除文本中的多余空格
+          label:
+            en_US: Remove Extra Spaces
+            ja_JP: 余分なスペースを削除
+            pt_BR: Remover Espaços Extras
+            zh_Hans: 移除多余空格
+          llm_description: Whether to remove extra spaces in the text
+          max: null
+          min: null
+          name: remove_extra_spaces
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove URLs and emails in the text
+            ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
+            pt_BR: Se deve remover URLs e e-mails no texto
+            zh_Hans: 是否移除文本中的URL和电子邮件地址
+          label:
+            en_US: Remove URLs and Emails
+            ja_JP: URLとメールアドレスを削除
+            pt_BR: Remover URLs e E-mails
+            zh_Hans: 移除URL和电子邮件地址
+          llm_description: Whether to remove URLs and emails in the text
+          max: null
+          min: null
+          name: remove_urls_emails
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          input_text: ''
+          max_length: ''
+          parent_mode: ''
+          remove_extra_spaces: ''
+          remove_urls_emails: ''
+          separator: ''
+          subchunk_max_length: ''
+          subchunk_separator: ''
+        provider_id: langgenius/parentchild_chunker/parentchild_chunker
+        provider_name: langgenius/parentchild_chunker/parentchild_chunker
+        provider_type: builtin
+        selected: true
+        title: Parent-child Chunker
+        tool_configurations: {}
+        tool_description: Parent-child Chunk Structure
+        tool_label: Parent-child Chunker
+        tool_name: parentchild_chunker
+        tool_parameters:
+          input_text:
+            type: mixed
+            value: '{{#1752489759475.content#}}'
+          max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          parent_mode:
+            type: variable
+            value:
+            - rag
+            - shared
+            - parent_mode
+          remove_extra_spaces:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+          remove_urls_emails:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          separator:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          subchunk_max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - child_max_chunk_length
+          subchunk_separator:
+            type: mixed
+            value: '{{#rag.shared.child_delimiter#}}'
+        type: tool
+      height: 52
+      id: '1752490343805'
+      position:
+        x: 1077.0240183162543
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1077.0240183162543
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -487.2912544090391
+      y: -54.7029301848807
+      zoom: 0.9994011715768695
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n
+    label: Child delimiter
+    max_length: 199
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n is recommended
+      for splitting parent chunks into small child chunks. You can also use special
+      delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: child_delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 512
+    label: Child max chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: child_max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: paragraph
+    label: Parent mode
+    max_length: 48
+    options:
+    - full_doc
+    - paragraph
+    placeholder: null
+    required: true
+    tooltips: null
+    type: select
+    unit: null
+    variable: parent_mode
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/website-crawl-general-economy.yml
+++ b/api/services/rag_pipeline/transform/website-crawl-general-economy.yml
@ -0,0 +1,674 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: website-crawl-general-economy
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752491761974-source-1752565435219-target
+      source: '1752491761974'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752565402678-source-1752565435219-target
+      source: '1752565402678'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752565435219-source-1752569675978-target
+      source: '1752565435219'
+      sourceHandle: source
+      target: '1752569675978'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752569675978-source-1752477924228-target
+      source: '1752569675978'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752569675978'
+        - result
+        indexing_technique: economy
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: keyword_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: true
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 2140.4053851189346
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 2140.4053851189346
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Jina Reader
+        datasource_name: jina_reader
+        datasource_parameters:
+          crawl_sub_pages:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752491761974'
+            - jina_limit
+          url:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_url#}}'
+          use_sitemap:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_use_sitemap#}}'
+        plugin_id: langgenius/jina_datasource
+        provider_name: jina
+        provider_type: website_crawl
+        selected: false
+        title: Jina Reader
+        type: datasource
+      height: 52
+      id: '1752491761974'
+      position:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Firecrawl
+        datasource_name: crawl
+        datasource_parameters:
+          crawl_subpages:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
+          exclude_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
+          include_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_limit
+          max_depth:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_max_depth
+          only_main_content:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
+          url:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_url#}}'
+        plugin_id: langgenius/firecrawl_datasource
+        provider_name: firecrawl
+        provider_type: website_crawl
+        selected: false
+        title: Firecrawl
+        type: datasource
+      height: 52
+      id: '1752565402678'
+      position:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        output_type: string
+        selected: false
+        title: Variable Aggregator
+        type: variable-aggregator
+        variables:
+        - - '1752491761974'
+          - content
+        - - '1752565402678'
+          - content
+      height: 129
+      id: '1752565435219'
+      position:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: 入力変数
+            pt_BR: Variável de entrada
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: チャンクの区切り記号。
+            pt_BR: O delimitador dos pedaços.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: 区切り記号
+            pt_BR: Delimitador
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: チャンクの最大長。
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: チャンク最大長
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: チャンクの重複長。
+            pt_BR: O comprimento de sobreposição dos blocos.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: チャンク重複長
+            pt_BR: Comprimento de sobreposição dos blocos
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Substituir espaços, quebras de linha e tabulações consecutivos
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Substituir espaços, quebras de linha e tabulações consecutivos
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Excluir todos os URLs e endereços de e-mail
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Excluir todos os URLs e endereços de e-mail
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunker/general_chunker
+        provider_name: langgenius/general_chunker/general_chunker
+        provider_type: builtin
+        selected: false
+        title: General Chunker
+        tool_configurations: {}
+        tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
+        tool_label: General Chunker
+        tool_name: general_chunker
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752565435219.output#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752569675978'
+      position:
+        x: 1807.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1807.4306671642219
+        y: 281.3910724383104
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -707.721097109337
+      y: -93.07807382100896
+      zoom: 0.9350632198875476
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: jina_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: jina_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: jina_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Use sitemap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
+      iteratively based on page relevance, yielding fewer but higher-quality pages.
+    type: checkbox
+    unit: null
+    variable: jina_use_sitemap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: true
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: firecrawl_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Max depth
+    max_length: 48
+    options: []
+    placeholder: ''
+    required: false
+    tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 scrapes only
+      the page at the entered URL, depth 1 scrapes that URL and everything one path
+      segment (one /) below it, and so on.
+    type: number
+    unit: null
+    variable: firecrawl_max_depth
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Exclude paths
+    max_length: 256
+    options: []
+    placeholder: blog/*, /about/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_exclude_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Include only paths
+    max_length: 256
+    options: []
+    placeholder: articles/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_include_only_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Extract main content
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_extract_main_content
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 50
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Setting the chunk overlap can maintain the semantic relevance between
+      adjacent chunks, improving retrieval quality. It is recommended to set it to
+      10%–25% of the maximum chunk size.
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/website-crawl-general-high-quality.yml
+++ b/api/services/rag_pipeline/transform/website-crawl-general-high-quality.yml
@ -0,0 +1,674 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/general_chunker:0.0.1@e3da408b7277866404c3f884d599261f9d0b9003ea4ef7eb3b64489bdf39d18b
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: '#FFF4ED'
+  icon_type: emoji
+  name: website-crawl-general-high-quality
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752491761974-source-1752565435219-target
+      source: '1752491761974'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752565402678-source-1752565435219-target
+      source: '1752565402678'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752565435219-source-1752569675978-target
+      source: '1752565435219'
+      sourceHandle: source
+      target: '1752569675978'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752569675978-source-1752477924228-target
+      source: '1752569675978'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: text_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752569675978'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 2140.4053851189346
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 2140.4053851189346
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Jina Reader
+        datasource_name: jina_reader
+        datasource_parameters:
+          crawl_sub_pages:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752491761974'
+            - jina_limit
+          url:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_url#}}'
+          use_sitemap:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_use_sitemap#}}'
+        plugin_id: langgenius/jina_datasource
+        provider_name: jina
+        provider_type: website_crawl
+        selected: false
+        title: Jina Reader
+        type: datasource
+      height: 52
+      id: '1752491761974'
+      position:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Firecrawl
+        datasource_name: crawl
+        datasource_parameters:
+          crawl_subpages:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
+          exclude_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
+          include_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_limit
+          max_depth:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_max_depth
+          only_main_content:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
+          url:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_url#}}'
+        plugin_id: langgenius/firecrawl_datasource
+        provider_name: firecrawl
+        provider_type: website_crawl
+        selected: false
+        title: Firecrawl
+        type: datasource
+      height: 52
+      id: '1752565402678'
+      position:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        output_type: string
+        selected: false
+        title: Variable Aggregator
+        type: variable-aggregator
+        variables:
+        - - '1752491761974'
+          - content
+        - - '1752565402678'
+          - content
+      height: 129
+      id: '1752565435219'
+      position:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: The result of the general chunk tool.
+              properties:
+                general_chunks:
+                  items:
+                    description: The chunk of the text.
+                    type: string
+                  type: array
+              type: object
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input Variable
+            ja_JP: 入力変数
+            pt_BR: Variável de entrada
+            zh_Hans: 输入变量
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_variable
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The delimiter of the chunks.
+            ja_JP: チャンクの区切り記号。
+            pt_BR: O delimitador dos pedaços.
+            zh_Hans: 块的分隔符。
+          label:
+            en_US: Delimiter
+            ja_JP: 区切り記号
+            pt_BR: Delimitador
+            zh_Hans: 分隔符
+          llm_description: The delimiter of the chunks, the format of the delimiter
+            must be a string.
+          max: null
+          min: null
+          name: delimiter
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The maximum chunk length.
+            ja_JP: 最大長のチャンク。
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度。
+          label:
+            en_US: Maximum Chunk Length
+            ja_JP: チャンク最大長
+            pt_BR: O comprimento máximo do bloco
+            zh_Hans: 最大块的长度
+          llm_description: The maximum chunk length, the format of the chunk size
+            must be an integer.
+          max: null
+          min: null
+          name: max_chunk_length
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The chunk overlap length.
+            ja_JP: チャンクの重複長。
+            pt_BR: O comprimento de sobreposição dos blocos.
+            zh_Hans: 块的重叠长度。
+          label:
+            en_US: Chunk Overlap Length
+            ja_JP: チャンク重複長
+            pt_BR: Comprimento de sobreposição dos blocos
+            zh_Hans: 块的重叠长度
+          llm_description: The chunk overlap length, the format of the chunk overlap
+            length must be an integer.
+          max: null
+          min: null
+          name: chunk_overlap_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Replace consecutive spaces, newlines and tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Substituir espaços, quebras de linha e tabulações consecutivos
+            zh_Hans: 替换连续的空格、换行符和制表符
+          label:
+            en_US: Replace Consecutive Spaces, Newlines and Tabs
+            ja_JP: 連続のスペース、改行、またはタブを置換する
+            pt_BR: Substituir espaços, quebras de linha e tabulações consecutivos
+            zh_Hans: 替换连续的空格、换行符和制表符
+          llm_description: Replace consecutive spaces, newlines and tabs, the format
+            of the replace must be a boolean.
+          max: null
+          min: null
+          name: replace_consecutive_spaces_newlines_tabs
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: Delete all URLs and email addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Excluir todos os URLs e endereços de e-mail
+            zh_Hans: 删除所有URL和电子邮件地址
+          label:
+            en_US: Delete All URLs and Email Addresses
+            ja_JP: すべてのURLとメールアドレスを削除する
+            pt_BR: Excluir todos os URLs e endereços de e-mail
+            zh_Hans: 删除所有URL和电子邮件地址
+          llm_description: Delete all URLs and email addresses, the format of the
+            delete must be a boolean.
+          max: null
+          min: null
+          name: delete_all_urls_and_email_addresses
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          chunk_overlap_length: ''
+          delete_all_urls_and_email_addresses: ''
+          delimiter: ''
+          input_variable: ''
+          max_chunk_length: ''
+          replace_consecutive_spaces_newlines_tabs: ''
+        provider_id: langgenius/general_chunker/general_chunker
+        provider_name: langgenius/general_chunker/general_chunker
+        provider_type: builtin
+        selected: false
+        title: General Chunker
+        tool_configurations: {}
+        tool_description: A tool for general text chunking mode, the chunks retrieved and recalled are the same.
+        tool_label: General Chunker
+        tool_name: general_chunker
+        tool_parameters:
+          chunk_overlap_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - chunk_overlap
+          delete_all_urls_and_email_addresses:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          delimiter:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          input_variable:
+            type: mixed
+            value: '{{#1752565435219.output#}}'
+          max_chunk_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          replace_consecutive_spaces_newlines_tabs:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+        type: tool
+      height: 52
+      id: '1752569675978'
+      position:
+        x: 1807.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1807.4306671642219
+        y: 281.3910724383104
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -707.721097109337
+      y: -93.07807382100896
+      zoom: 0.9350632198875476
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: jina_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: jina_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: jina_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Use sitemap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
+      iteratively based on page relevance, yielding fewer but higher-quality pages.
+    type: checkbox
+    unit: null
+    variable: jina_use_sitemap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: true
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: firecrawl_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Max depth
+    max_length: 48
+    options: []
+    placeholder: ''
+    required: false
+    tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 scrapes only
+      the page at the entered URL, depth 1 scrapes that URL and everything one path
+      segment (one /) below it, and so on.
+    type: number
+    unit: null
+    variable: firecrawl_max_depth
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Exclude paths
+    max_length: 256
+    options: []
+    placeholder: blog/*, /about/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_exclude_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Include only paths
+    max_length: 256
+    options: []
+    placeholder: articles/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_include_only_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Extract main content
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_extract_main_content
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: Delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 50
+    label: Chunk overlap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Setting the chunk overlap can maintain the semantic relevance between
+      adjacent chunks, improving retrieval quality. It is recommended to set it to
+      10%–25% of the maximum chunk size.
+    type: number
+    unit: characters
+    variable: chunk_overlap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/rag_pipeline/transform/website-crawl-parentchild.yml
+++ b/api/services/rag_pipeline/transform/website-crawl-parentchild.yml
@ -0,0 +1,779 @@
+dependencies:
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/parentchild_chunker:0.0.1@b1a28a27e33fec442ce494da2a7814edd7eb9d646c81f38bccfcf1133d486e40
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/firecrawl_datasource:0.0.1@f7aed0a26df0e5f4b9555371b5c9fa6db3c7dcf6a46dd1583245697bd90a539a
+- current_identifier: null
+  type: marketplace
+  value:
+    plugin_unique_identifier: langgenius/jina_datasource:0.0.1@cf23afb2c3eeccc5a187763a1947f583f0bb10aa56461e512ac4141bf930d608
+kind: rag_pipeline
+rag_pipeline:
+  description: ''
+  icon: 📙
+  icon_background: ''
+  icon_type: emoji
+  name: website-crawl-parentchild
+version: 0.1.0
+workflow:
+  conversation_variables: []
+  environment_variables: []
+  features: {}
+  graph:
+    edges:
+    - data:
+        isInLoop: false
+        sourceType: tool
+        targetType: knowledge-index
+      id: 1752490343805-source-1752477924228-target
+      source: '1752490343805'
+      sourceHandle: source
+      target: '1752477924228'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752491761974-source-1752565435219-target
+      source: '1752491761974'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInIteration: false
+        isInLoop: false
+        sourceType: variable-aggregator
+        targetType: tool
+      id: 1752565435219-source-1752490343805-target
+      source: '1752565435219'
+      sourceHandle: source
+      target: '1752490343805'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    - data:
+        isInLoop: false
+        sourceType: datasource
+        targetType: variable-aggregator
+      id: 1752565402678-source-1752565435219-target
+      source: '1752565402678'
+      sourceHandle: source
+      target: '1752565435219'
+      targetHandle: target
+      type: custom
+      zIndex: 0
+    nodes:
+    - data:
+        chunk_structure: hierarchical_model
+        embedding_model: text-embedding-ada-002
+        embedding_model_provider: langgenius/openai/openai
+        index_chunk_variable_selector:
+        - '1752490343805'
+        - result
+        indexing_technique: high_quality
+        keyword_number: 10
+        retrieval_model:
+          score_threshold: 0.5
+          score_threshold_enabled: false
+          search_method: semantic_search
+          top_k: 3
+          vector_setting:
+            embedding_model_name: text-embedding-ada-002
+            embedding_provider_name: langgenius/openai/openai
+        selected: false
+        title: Knowledge Base
+        type: knowledge-index
+      height: 114
+      id: '1752477924228'
+      position:
+        x: 2215.5544306817387
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 2215.5544306817387
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        is_team_authorization: true
+        output_schema:
+          properties:
+            result:
+              description: Parent child chunks result
+              items:
+                type: object
+              type: array
+          type: object
+        paramSchemas:
+        - auto_generate: null
+          default: null
+          form: llm
+          human_description:
+            en_US: The text you want to chunk.
+            ja_JP: チャンク化したいテキスト。
+            pt_BR: O texto que você deseja dividir.
+            zh_Hans: 你想要分块的文本。
+          label:
+            en_US: Input text
+            ja_JP: 入力テキスト
+            pt_BR: Texto de entrada
+            zh_Hans: 输入文本
+          llm_description: The text you want to chunk.
+          max: null
+          min: null
+          name: input_text
+          options: []
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 1024
+          form: llm
+          human_description:
+            en_US: Maximum length for chunking
+            ja_JP: チャンク分割の最大長
+            pt_BR: Comprimento máximo para divisão
+            zh_Hans: 用于分块的最大长度
+          label:
+            en_US: Maximum Length
+            ja_JP: 最大長
+            pt_BR: Comprimento Máximo
+            zh_Hans: 最大长度
+          llm_description: Maximum length allowed per chunk
+          max: null
+          min: null
+          name: max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '
+
+
+            '
+          form: llm
+          human_description:
+            en_US: Separator used for chunking
+            ja_JP: チャンク分割に使用する区切り文字
+            pt_BR: Separador usado para divisão
+            zh_Hans: 用于分块的分隔符
+          label:
+            en_US: Chunk Separator
+            ja_JP: チャンク区切り文字
+            pt_BR: Separador de Divisão
+            zh_Hans: 分块分隔符
+          llm_description: The separator used to split chunks
+          max: null
+          min: null
+          name: separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: 512
+          form: llm
+          human_description:
+            en_US: Maximum length for subchunking
+            ja_JP: サブチャンク分割の最大長
+            pt_BR: Comprimento máximo para subdivisão
+            zh_Hans: 用于子分块的最大长度
+          label:
+            en_US: Subchunk Maximum Length
+            ja_JP: サブチャンク最大長
+            pt_BR: Comprimento Máximo de Subdivisão
+            zh_Hans: 子分块最大长度
+          llm_description: Maximum length allowed per subchunk
+          max: null
+          min: null
+          name: subchunk_max_length
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: number
+        - auto_generate: null
+          default: '. '
+          form: llm
+          human_description:
+            en_US: Separator used for subchunking
+            ja_JP: サブチャンク分割に使用する区切り文字
+            pt_BR: Separador usado para subdivisão
+            zh_Hans: 用于子分块的分隔符
+          label:
+            en_US: Subchunk Separator
+            ja_JP: サブチャンキング用セパレーター
+            pt_BR: Separador de Subdivisão
+            zh_Hans: 子分块分隔符
+          llm_description: The separator used to split subchunks
+          max: null
+          min: null
+          name: subchunk_separator
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: string
+        - auto_generate: null
+          default: paragraph
+          form: llm
+          human_description:
+            en_US: Split text into paragraphs based on separator and maximum chunk
+              length, using split text as parent block or entire document as parent
+              block and directly retrieve.
+            ja_JP: セパレーターと最大チャンク長に基づいてテキストを段落に分割し、分割されたテキスト
+              を親ブロックとして使用するか、文書全体を親ブロックとして使用して直接取得します。
+            pt_BR: Dividir texto em parágrafos com base no separador e no comprimento
+              máximo do bloco, usando o texto dividido como bloco pai ou documento
+              completo como bloco pai e diretamente recuperá-lo.
+            zh_Hans: 根据分隔符和最大块长度将文本拆分为段落，使用拆分文本作为检索的父块或整个文档用作父块并直接检索。
+          label:
+            en_US: Parent Mode
+            ja_JP: 親子モード
+            pt_BR: Modo Pai
+            zh_Hans: 父块模式
+          llm_description: Split text into paragraphs based on separator and maximum
+            chunk length, using split text as parent block or entire document as parent
+            block and directly retrieve.
+          max: null
+          min: null
+          name: parent_mode
+          options:
+          - icon: ''
+            label:
+              en_US: Paragraph
+              ja_JP: 段落
+              pt_BR: Parágrafo
+              zh_Hans: 段落
+            value: paragraph
+          - icon: ''
+            label:
+              en_US: Full Document
+              ja_JP: 全文
+              pt_BR: Documento Completo
+              zh_Hans: 全文
+            value: full_doc
+          placeholder: null
+          precision: null
+          required: true
+          scope: null
+          template: null
+          type: select
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove extra spaces in the text
+            ja_JP: テキスト内の余分なスペースを削除するかどうか
+            pt_BR: Se deve remover espaços extras no texto
+            zh_Hans: 是否移除文本中的多余空格
+          label:
+            en_US: Remove Extra Spaces
+            ja_JP: 余分なスペースを削除
+            pt_BR: Remover Espaços Extras
+            zh_Hans: 移除多余空格
+          llm_description: Whether to remove extra spaces in the text
+          max: null
+          min: null
+          name: remove_extra_spaces
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        - auto_generate: null
+          default: 0
+          form: llm
+          human_description:
+            en_US: Whether to remove URLs and emails in the text
+            ja_JP: テキスト内のURLやメールアドレスを削除するかどうか
+            pt_BR: Se deve remover URLs e e-mails no texto
+            zh_Hans: 是否移除文本中的URL和电子邮件地址
+          label:
+            en_US: Remove URLs and Emails
+            ja_JP: URLとメールアドレスを削除
+            pt_BR: Remover URLs e E-mails
+            zh_Hans: 移除URL和电子邮件地址
+          llm_description: Whether to remove URLs and emails in the text
+          max: null
+          min: null
+          name: remove_urls_emails
+          options: []
+          placeholder: null
+          precision: null
+          required: false
+          scope: null
+          template: null
+          type: boolean
+        params:
+          input_text: ''
+          max_length: ''
+          parent_mode: ''
+          remove_extra_spaces: ''
+          remove_urls_emails: ''
+          separator: ''
+          subchunk_max_length: ''
+          subchunk_separator: ''
+        provider_id: langgenius/parentchild_chunker/parentchild_chunker
+        provider_name: langgenius/parentchild_chunker/parentchild_chunker
+        provider_type: builtin
+        selected: true
+        title: Parent-child Chunker
+        tool_configurations: {}
+        tool_description: Parent-child Chunk Structure
+        tool_label: Parent-child Chunker
+        tool_name: parentchild_chunker
+        tool_parameters:
+          input_text:
+            type: mixed
+            value: '{{#1752565435219.output#}}'
+          max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - max_chunk_length
+          parent_mode:
+            type: variable
+            value:
+            - rag
+            - shared
+            - parent_mode
+          remove_extra_spaces:
+            type: mixed
+            value: '{{#rag.shared.replace_consecutive_spaces#}}'
+          remove_urls_emails:
+            type: mixed
+            value: '{{#rag.shared.delete_urls_email#}}'
+          separator:
+            type: mixed
+            value: '{{#rag.shared.delimiter#}}'
+          subchunk_max_length:
+            type: variable
+            value:
+            - rag
+            - shared
+            - child_max_chunk_length
+          subchunk_separator:
+            type: mixed
+            value: '{{#rag.shared.child_delimiter#}}'
+        type: tool
+      height: 52
+      id: '1752490343805'
+      position:
+        x: 1853.5260563244174
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1853.5260563244174
+        y: 281.3910724383104
+      selected: true
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Jina Reader
+        datasource_name: jina_reader
+        datasource_parameters:
+          crawl_sub_pages:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_crawl_sub_pages#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752491761974'
+            - jina_limit
+          url:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_url#}}'
+          use_sitemap:
+            type: mixed
+            value: '{{#rag.1752491761974.jina_use_sitemap#}}'
+        plugin_id: langgenius/jina_datasource
+        provider_name: jina
+        provider_type: website_crawl
+        selected: false
+        title: Jina Reader
+        type: datasource
+      height: 52
+      id: '1752491761974'
+      position:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        datasource_configurations: {}
+        datasource_label: Firecrawl
+        datasource_name: crawl
+        datasource_parameters:
+          crawl_subpages:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_crawl_sub_pages#}}'
+          exclude_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_exclude_paths#}}'
+          include_paths:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_include_only_paths#}}'
+          limit:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_limit
+          max_depth:
+            type: variable
+            value:
+            - rag
+            - '1752565402678'
+            - firecrawl_max_depth
+          only_main_content:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_extract_main_content#}}'
+          url:
+            type: mixed
+            value: '{{#rag.1752565402678.firecrawl_url#}}'
+        plugin_id: langgenius/firecrawl_datasource
+        provider_name: firecrawl
+        provider_type: website_crawl
+        selected: false
+        title: Firecrawl
+        type: datasource
+      height: 52
+      id: '1752565402678'
+      position:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      positionAbsolute:
+        x: 1067.7526055798794
+        y: 417.32608398342404
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    - data:
+        output_type: string
+        selected: false
+        title: Variable Aggregator
+        type: variable-aggregator
+        variables:
+        - - '1752491761974'
+          - content
+        - - '1752565402678'
+          - content
+      height: 129
+      id: '1752565435219'
+      position:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      positionAbsolute:
+        x: 1505.4306671642219
+        y: 281.3910724383104
+      selected: false
+      sourcePosition: right
+      targetPosition: left
+      type: custom
+      width: 242
+    viewport:
+      x: -826.1791044466438
+      y: -71.91725474841303
+      zoom: 0.9980166672552107
+  rag_pipeline_variables:
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: jina_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: jina_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: jina_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752491761974'
+    default_value: null
+    label: Use sitemap
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: Follow the sitemap to crawl the site. If not, Jina Reader will crawl
+      iteratively based on page relevance, yielding fewer but higher-quality pages.
+    type: checkbox
+    unit: null
+    variable: jina_use_sitemap
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: URL
+    max_length: 256
+    options: []
+    placeholder: https://docs.dify.ai/en/
+    required: true
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_url
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: true
+    label: Crawl sub-pages
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_crawl_sub_pages
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: 10
+    label: Limit
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: null
+    variable: firecrawl_limit
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Max depth
+    max_length: 48
+    options: []
+    placeholder: ''
+    required: false
+    tooltips: Maximum depth to crawl relative to the entered URL. Depth 0 scrapes only
+      the page at the entered URL; depth 1 scrapes that page and everything one path
+      segment (one "/") below it; and so on.
+    type: number
+    unit: null
+    variable: firecrawl_max_depth
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Exclude paths
+    max_length: 256
+    options: []
+    placeholder: blog/*, /about/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_exclude_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: Include only paths
+    max_length: 256
+    options: []
+    placeholder: articles/*
+    required: false
+    tooltips: null
+    type: text-input
+    unit: null
+    variable: firecrawl_include_only_paths
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: '1752565402678'
+    default_value: null
+    label: firecrawl_extract_main_content
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: firecrawl_extract_main_content
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n\n
+    label: delimiter
+    max_length: 100
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n\n is recommended
+      for splitting the original document into large parent chunks. You can also use
+      special delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 1024
+    label: Maximum chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: \n
+    label: Child delimiter
+    max_length: 199
+    options: []
+    placeholder: null
+    required: true
+    tooltips: A delimiter is the character used to separate text. \n is recommended
+      for splitting parent chunks into small child chunks. You can also use special
+      delimiters defined by yourself.
+    type: text-input
+    unit: null
+    variable: child_delimiter
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: 512
+    label: Child max chunk length
+    max_length: 48
+    options: []
+    placeholder: null
+    required: true
+    tooltips: null
+    type: number
+    unit: characters
+    variable: child_max_chunk_length
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: paragraph
+    label: Parent mode
+    max_length: 48
+    options:
+    - full_doc
+    - paragraph
+    placeholder: null
+    required: true
+    tooltips: null
+    type: select
+    unit: null
+    variable: parent_mode
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Replace consecutive spaces, newlines and tabs
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: replace_consecutive_spaces
+  - allow_file_extension: null
+    allow_file_upload_methods: null
+    allowed_file_types: null
+    belong_to_node_id: shared
+    default_value: null
+    label: Delete all URLs and email addresses
+    max_length: 48
+    options: []
+    placeholder: null
+    required: false
+    tooltips: null
+    type: checkbox
+    unit: null
+    variable: delete_urls_email
--- a/api/services/tools/builtin_tools_manage_service.py
+++ b/api/services/tools/builtin_tools_manage_service.py
@ -1,6 +1,5 @@
 import json
 import logging
-import re
 from collections.abc import Mapping
 from pathlib import Path
 from typing import Any
@ -10,9 +9,9 @@ from sqlalchemy.orm import Session

 from configs import dify_config
 from constants import HIDDEN_VALUE, UNKNOWN_VALUE
+from core.helper.name_generator import generate_incremental_name
 from core.helper.position_helper import is_filtered
 from core.helper.provider_cache import NoOpProviderCredentialCache, ToolProviderCredentialsCache
-from core.plugin.entities.plugin import ToolProviderID
 from core.tools.builtin_tool.provider import BuiltinToolProviderController
 from core.tools.builtin_tool.providers._positions import BuiltinToolProviderSort
 from core.tools.entities.api_entities import (
@ -30,6 +29,7 @@ from core.tools.utils.encryption import create_provider_encrypter
 from core.tools.utils.system_oauth_encryption import decrypt_system_oauth_params
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
+from models.provider_ids import ToolProviderID
 from models.tools import BuiltinToolProvider, ToolOAuthSystemClient, ToolOAuthTenantClient
 from services.plugin.plugin_service import PluginService
 from services.tools.tools_transform_service import ToolTransformService
@ -311,42 +311,20 @@ class BuiltinToolManageService:
    def generate_builtin_tool_provider_name(
        session: Session, tenant_id: str, provider: str, credential_type: CredentialType
    ) -> str:
-        try:
-            db_providers = (
-                session.query(BuiltinToolProvider)
-                .filter_by(
-                    tenant_id=tenant_id,
-                    provider=provider,
-                    credential_type=credential_type.value,
-                )
-                .order_by(BuiltinToolProvider.created_at.desc())
-                .all()
+        db_providers = (
+            session.query(BuiltinToolProvider)
+            .filter_by(
+                tenant_id=tenant_id,
+                provider=provider,
+                credential_type=credential_type.value,
            )
-
-            # Get the default name pattern
-            default_pattern = f"{credential_type.get_name()}"
-
-            # Find all names that match the default pattern: "{default_pattern} {number}"
-            pattern = rf"^{re.escape(default_pattern)}\s+(\d+)$"
-            numbers = []
-
-            for db_provider in db_providers:
-                if db_provider.name:
-                    match = re.match(pattern, db_provider.name.strip())
-                    if match:
-                        numbers.append(int(match.group(1)))
-
-            # If no default pattern names found, start with 1
-            if not numbers:
-                return f"{default_pattern} 1"
-
-            # Find the next number
-            max_number = max(numbers)
-            return f"{default_pattern} {max_number + 1}"
-        except Exception as e:
-            logger.warning("Error generating next provider name for %s: %s", provider, str(e))
-            # fallback
-            return f"{credential_type.get_name()} 1"
+            .order_by(BuiltinToolProvider.created_at.desc())
+            .all()
+        )
+        return generate_incremental_name(
+            [provider.name for provider in db_providers],
+            f"{credential_type.get_name()}",
+        )

    @staticmethod
    def get_builtin_tool_provider_credentials(
--- a/api/services/tools/tools_transform_service.py
+++ b/api/services/tools/tools_transform_service.py
@ -1,12 +1,14 @@
 import json
 import logging
-from typing import Any, Union, cast
+from collections.abc import Mapping
+from typing import Any, Union

 from yarl import URL

 from configs import dify_config
 from core.helper.provider_cache import ToolProviderCredentialsCache
 from core.mcp.types import Tool as MCPTool
+from core.plugin.entities.plugin_daemon import PluginDatasourceProviderEntity
 from core.tools.__base.tool import Tool
 from core.tools.__base.tool_runtime import ToolRuntime
 from core.tools.builtin_tool.provider import BuiltinToolProviderController
@ -38,7 +40,9 @@ class ToolTransformService:
        return str(url_prefix % {"tenant_id": tenant_id, "filename": filename})

    @classmethod
-    def get_tool_provider_icon_url(cls, provider_type: str, provider_name: str, icon: str | dict) -> Union[str, dict]:
+    def get_tool_provider_icon_url(
+        cls, provider_type: str, provider_name: str, icon: str | Mapping[str, str]
+    ) -> str | Mapping[str, str]:
        """
        get tool provider icon url
        """
@ -51,7 +55,7 @@ class ToolTransformService:
        elif provider_type in {ToolProviderType.API.value, ToolProviderType.WORKFLOW.value}:
            try:
                if isinstance(icon, str):
-                    return cast(dict, json.loads(icon))
+                    return json.loads(icon)
                return icon
            except Exception:
                return {"background": "#252525", "content": "\ud83d\ude01"}
@ -60,7 +64,7 @@ class ToolTransformService:
        return ""

    @staticmethod
-    def repack_provider(tenant_id: str, provider: Union[dict, ToolProviderApiEntity]):
+    def repack_provider(tenant_id: str, provider: Union[dict, ToolProviderApiEntity, PluginDatasourceProviderEntity]):
        """
        repack provider

@ -89,6 +93,12 @@ class ToolTransformService:
                    provider.icon_dark = ToolTransformService.get_tool_provider_icon_url(
                        provider_type=provider.type.value, provider_name=provider.name, icon=provider.icon_dark
                    )
+        elif isinstance(provider, PluginDatasourceProviderEntity):
+            if provider.plugin_id:
+                if isinstance(provider.declaration.identity.icon, str):
+                    provider.declaration.identity.icon = ToolTransformService.get_plugin_icon_url(
+                        tenant_id=tenant_id, filename=provider.declaration.identity.icon
+                    )

    @classmethod
    def builtin_provider_to_user_provider(
@ -106,7 +116,7 @@ class ToolTransformService:
            name=provider_controller.entity.identity.name,
            description=provider_controller.entity.identity.description,
            icon=provider_controller.entity.identity.icon,
-            icon_dark=provider_controller.entity.identity.icon_dark,
+            icon_dark=provider_controller.entity.identity.icon_dark or "",
            label=provider_controller.entity.identity.label,
            type=ToolProviderType.BUILT_IN,
            masked_credentials={},
@ -128,9 +138,10 @@ class ToolTransformService:
            )
        }

+        masked_creds = {}
        for name in schema:
-            if result.masked_credentials:
-                result.masked_credentials[name] = ""
+            masked_creds[name] = ""
+        result.masked_credentials = masked_creds

        # check if the provider need credentials
        if not provider_controller.need_credentials:
@ -208,7 +219,7 @@ class ToolTransformService:
            name=provider_controller.entity.identity.name,
            description=provider_controller.entity.identity.description,
            icon=provider_controller.entity.identity.icon,
-            icon_dark=provider_controller.entity.identity.icon_dark,
+            icon_dark=provider_controller.entity.identity.icon_dark or "",
            label=provider_controller.entity.identity.label,
            type=ToolProviderType.WORKFLOW,
            masked_credentials={},
@ -321,7 +332,7 @@ class ToolTransformService:

    @staticmethod
    def convert_tool_entity_to_api_entity(
-        tool: Union[ApiToolBundle, WorkflowTool, Tool],
+        tool: ApiToolBundle | WorkflowTool | Tool,
        tenant_id: str,
        labels: list[str] | None = None,
    ) -> ToolApiEntity:
@ -375,7 +386,7 @@ class ToolTransformService:
                parameters=merged_parameters,
                labels=labels or [],
            )
-        elif isinstance(tool, ApiToolBundle):
+        else:
            return ToolApiEntity(
                author=tool.author,
                name=tool.operation_id or "",
@ -384,9 +395,6 @@ class ToolTransformService:
                parameters=tool.parameters,
                labels=labels or [],
            )
-        else:
-            # Handle WorkflowTool case
-            raise ValueError(f"Unsupported tool type: {type(tool)}")

    @staticmethod
    def convert_builtin_provider_to_credential_entity(
--- a/api/services/variable_truncator.py
+++ b/api/services/variable_truncator.py
@ -0,0 +1,394 @@
+import dataclasses
+from collections.abc import Mapping
+from typing import Any, Generic, TypeAlias, TypeVar, overload
+
+from configs import dify_config
+from core.file.models import File
+from core.variables.segments import (
+    ArrayFileSegment,
+    ArraySegment,
+    BooleanSegment,
+    FileSegment,
+    FloatSegment,
+    IntegerSegment,
+    NoneSegment,
+    ObjectSegment,
+    Segment,
+    StringSegment,
+)
+from core.variables.utils import dumps_with_segments
+
+_MAX_DEPTH = 100
+
+
+class _QAKeys:
+    """dict keys for _QAStructure"""
+
+    # Top-level key; `VariableTruncator._maybe_qa_structure` expects its value to be a list.
+    QA_CHUNKS = "qa_chunks"
+    # Presumably the per-chunk keys of each QA entry -- TODO confirm against the producer of this structure.
+    QUESTION = "question"
+    ANSWER = "answer"
+
+
+class _PCKeys:
+    """dict keys for _ParentChildStructure"""
+
+    # Top-level key; `VariableTruncator._maybe_parent_child_structure` expects a string value.
+    PARENT_MODE = "parent_mode"
+    # Top-level key; `VariableTruncator._maybe_parent_child_structure` expects a list value.
+    PARENT_CHILD_CHUNKS = "parent_child_chunks"
+    # Presumably the per-chunk keys of each parent/child entry -- TODO confirm against the producer.
+    PARENT_CONTENT = "parent_content"
+    CHILD_CONTENTS = "child_contents"
+
+
+_T = TypeVar("_T")
+
+
+@dataclasses.dataclass(frozen=True)
+class _PartResult(Generic[_T]):
+    """Outcome of truncating a single value (or a nested part of one) in `VariableTruncator`."""
+
+    # The resulting, possibly truncated, value.
+    value: _T
+    # Size accounted for `value`, in the approximate units of `VariableTruncator.calculate_json_size`.
+    value_size: int
+    # Whether truncation was applied while producing `value`.
+    truncated: bool
+
+
+class MaxDepthExceededError(Exception):
+    """Raised by `VariableTruncator.calculate_json_size` when nesting goes deeper than `_MAX_DEPTH`."""
+
+    pass
+
+
+class UnknownTypeError(Exception):
+    """Raised by `VariableTruncator.calculate_json_size` when it meets a value of an unsupported type."""
+
+    pass
+
+
+JSONTypes: TypeAlias = int | float | str | list | dict | None | bool
+
+
+@dataclasses.dataclass(frozen=True)
+class TruncationResult:
+    """Value returned by `VariableTruncator.truncate`."""
+
+    # The resulting segment (possibly a truncated copy, or a `StringSegment` fallback).
+    result: Segment
+    # Whether truncation was applied to produce `result`.
+    truncated: bool
+
+
+class VariableTruncator:
+    """
+    Handles variable truncation with structure-preserving strategies.
+
+    This class implements intelligent truncation that prioritizes maintaining data structure
+    integrity while ensuring the final size doesn't exceed specified limits.
+
+    Uses recursive size calculation to avoid repeated JSON serialization.
+    """
+
+    def __init__(
+        self,
+        string_length_limit: int = 5000,
+        array_element_limit: int = 20,
+        max_size_bytes: int = 1024_000,  # 1,024,000 bytes (~1 MB)
+    ):
+        """
+        Configure the truncation limits.
+
+        Args:
+            string_length_limit: Upper bound on the number of characters kept from a
+                single string (see `_truncate_string`); must be greater than 3.
+            array_element_limit: Maximum number of elements kept per array; must be
+                greater than 0.
+            max_size_bytes: Overall size budget, measured in the approximate units of
+                `calculate_json_size`; must be greater than 0.
+
+        Raises:
+            ValueError: If any of the limits is outside its allowed range.
+        """
+        if string_length_limit <= 3:
+            raise ValueError("string_length_limit should be greater than 3.")
+        self._string_length_limit = string_length_limit
+
+        if array_element_limit <= 0:
+            raise ValueError("array_element_limit should be greater than 0.")
+        self._array_element_limit = array_element_limit
+
+        if max_size_bytes <= 0:
+            raise ValueError("max_size_bytes should be greater than 0.")
+        self._max_size_bytes = max_size_bytes
+
+    @classmethod
+    def default(cls) -> "VariableTruncator":
+        return VariableTruncator(
+            max_size_bytes=dify_config.WORKFLOW_VARIABLE_TRUNCATION_MAX_SIZE,
+            array_element_limit=dify_config.WORKFLOW_VARIABLE_TRUNCATION_ARRAY_LENGTH,
+            string_length_limit=dify_config.WORKFLOW_VARIABLE_TRUNCATION_STRING_LENGTH,
+        )
+
+    def truncate_variable_mapping(self, v: Mapping[str, Any]) -> tuple[Mapping[str, Any], bool]:
+        """
+        `truncate_variable_mapping` is responsible for truncating variable mappings
+        generated during workflow execution, such as `inputs`, `process_data`, or `outputs`
+        of a WorkflowNodeExecution record. This ensures the mappings remain within the
+        specified size limits while preserving their structure.
+        """
+        budget = self._max_size_bytes
+        is_truncated = False
+        truncated_mapping: dict[str, Any] = {}
+        length = len(v.items())
+        used_size = 0
+        for key, value in v.items():
+            used_size += self.calculate_json_size(key)
+            if used_size > budget:
+                truncated_mapping[key] = "..."
+                continue
+            value_budget = (budget - used_size) // (length - len(truncated_mapping))
+            if isinstance(value, Segment):
+                part_result = self._truncate_segment(value, value_budget)
+            else:
+                part_result = self._truncate_json_primitives(value, value_budget)
+            is_truncated = is_truncated or part_result.truncated
+            truncated_mapping[key] = part_result.value
+            used_size += part_result.value_size
+        return truncated_mapping, is_truncated
+
+    @staticmethod
+    def _segment_need_truncation(segment: Segment) -> bool:
+        if isinstance(
+            segment,
+            (NoneSegment, FloatSegment, IntegerSegment, FileSegment, BooleanSegment, ArrayFileSegment),
+        ):
+            return False
+        return True
+
+    @staticmethod
+    def _json_value_needs_truncation(value: Any) -> bool:
+        if value is None:
+            return False
+        if isinstance(value, (bool, int, float)):
+            return False
+        return True
+
+    def truncate(self, segment: Segment) -> TruncationResult:
+        if isinstance(segment, StringSegment):
+            result = self._truncate_segment(segment, self._string_length_limit)
+        else:
+            result = self._truncate_segment(segment, self._max_size_bytes)
+
+        if result.value_size > self._max_size_bytes:
+            if isinstance(result.value, str):
+                result = self._truncate_string(result.value, self._max_size_bytes)
+                return TruncationResult(StringSegment(value=result.value), True)
+
+            # Apply final fallback - convert to JSON string and truncate
+            json_str = dumps_with_segments(result.value, ensure_ascii=False)
+            if len(json_str) > self._max_size_bytes:
+                json_str = json_str[: self._max_size_bytes] + "..."
+            return TruncationResult(result=StringSegment(value=json_str), truncated=True)
+
+        return TruncationResult(
+            result=segment.model_copy(update={"value": result.value.value}), truncated=result.truncated
+        )
+
+    def _truncate_segment(self, segment: Segment, target_size: int) -> _PartResult[Segment]:
+        """
+        Apply smart truncation to a variable value.
+
+        Args:
+            value: The value to truncate (can be Segment or raw value)
+
+        Returns:
+            TruncationResult with truncated data and truncation status
+        """
+
+        if not VariableTruncator._segment_need_truncation(segment):
+            return _PartResult(segment, self.calculate_json_size(segment.value), False)
+
+        result: _PartResult[Any]
+        # Apply type-specific truncation with target size
+        if isinstance(segment, ArraySegment):
+            result = self._truncate_array(segment.value, target_size)
+        elif isinstance(segment, StringSegment):
+            result = self._truncate_string(segment.value, target_size)
+        elif isinstance(segment, ObjectSegment):
+            result = self._truncate_object(segment.value, target_size)
+        else:
+            raise AssertionError("this should be unreachable.")
+
+        return _PartResult(
+            value=segment.model_copy(update={"value": result.value}),
+            value_size=result.value_size,
+            truncated=result.truncated,
+        )
+
+    @staticmethod
+    def calculate_json_size(value: Any, depth=0) -> int:
+        """Recursively calculate JSON size without serialization."""
+        if isinstance(value, Segment):
+            return VariableTruncator.calculate_json_size(value.value)
+        if depth > _MAX_DEPTH:
+            raise MaxDepthExceededError()
+        if isinstance(value, str):
+            # Ideally, the size of strings should be calculated based on their utf-8 encoded length.
+            # However, this adds complexity as we would need to compute encoded sizes consistently
+            # throughout the code. Therefore, we approximate the size using the string's length.
+            # Rough estimate: number of characters, plus 2 for quotes
+            return len(value) + 2
+        elif isinstance(value, (int, float)):
+            return len(str(value))
+        elif isinstance(value, bool):
+            return 4 if value else 5  # "true" or "false"
+        elif value is None:
+            return 4  # "null"
+        elif isinstance(value, list):
+            # Size = sum of elements + separators + brackets
+            total = 2  # "[]"
+            for i, item in enumerate(value):
+                if i > 0:
+                    total += 1  # ","
+                total += VariableTruncator.calculate_json_size(item, depth=depth + 1)
+            return total
+        elif isinstance(value, dict):
+            # Size = sum of keys + values + separators + brackets
+            total = 2  # "{}"
+            for index, key in enumerate(value.keys()):
+                if index > 0:
+                    total += 1  # ","
+                total += VariableTruncator.calculate_json_size(str(key), depth=depth + 1)  # Key as string
+                total += 1  # ":"
+                total += VariableTruncator.calculate_json_size(value[key], depth=depth + 1)
+            return total
+        elif isinstance(value, File):
+            return VariableTruncator.calculate_json_size(value.model_dump(), depth=depth + 1)
+        else:
+            raise UnknownTypeError(f"got unknown type {type(value)}")
+
+    def _truncate_string(self, value: str, target_size: int) -> _PartResult[str]:
+        if (size := self.calculate_json_size(value)) < target_size:
+            return _PartResult(value, size, False)
+        if target_size < 5:
+            return _PartResult("...", 5, True)
+        truncated_size = min(self._string_length_limit, target_size - 5)
+        truncated_value = value[:truncated_size] + "..."
+        return _PartResult(truncated_value, self.calculate_json_size(truncated_value), True)
+
+    def _truncate_array(self, value: list, target_size: int) -> _PartResult[list]:
+        """
+        Truncate array with correct strategy:
+        1. First limit to 20 items
+        2. If still too large, truncate individual items
+        """
+
+        truncated_value: list[Any] = []
+        truncated = False
+        used_size = self.calculate_json_size([])
+
+        target_length = self._array_element_limit
+
+        for i, item in enumerate(value):
+            if i >= target_length:
+                return _PartResult(truncated_value, used_size, True)
+            if i > 0:
+                used_size += 1  # Account for comma
+
+            if used_size > target_size:
+                break
+
+            part_result = self._truncate_json_primitives(item, target_size - used_size)
+            truncated_value.append(part_result.value)
+            used_size += part_result.value_size
+            truncated = part_result.truncated
+        return _PartResult(truncated_value, used_size, truncated)
+
+    @classmethod
+    def _maybe_qa_structure(cls, m: Mapping[str, Any]) -> bool:
+        qa_chunks = m.get(_QAKeys.QA_CHUNKS)
+        if qa_chunks is None:
+            return False
+        if not isinstance(qa_chunks, list):
+            return False
+        return True
+
+    @classmethod
+    def _maybe_parent_child_structure(cls, m: Mapping[str, Any]) -> bool:
+        parent_mode = m.get(_PCKeys.PARENT_MODE)
+        if parent_mode is None:
+            return False
+        if not isinstance(parent_mode, str):
+            return False
+        parent_child_chunks = m.get(_PCKeys.PARENT_CHILD_CHUNKS)
+        if parent_child_chunks is None:
+            return False
+        if not isinstance(parent_child_chunks, list):
+            return False
+
+        return True
+
+    def _truncate_object(self, mapping: Mapping[str, Any], target_size: int) -> _PartResult[Mapping[str, Any]]:
+        """
+        Truncate object with key preservation priority.
+
+        Strategy:
+        1. Visit keys in sorted order and truncate each value to an equal share of
+           the budget that is still unused.
+        2. Once the budget is exhausted, drop the remaining keys (the ones that
+           come last in sorted order).
+
+        Args:
+            mapping: The object to truncate. Values may be `Segment`s or JSON-compatible values.
+            target_size: Size budget, in the units of `calculate_json_size`.
+
+        Returns:
+            A `_PartResult` with the kept key/value pairs, the size accounted for
+            them, and whether anything was dropped or truncated.
+        """
+        if not mapping:
+            return _PartResult(mapping, self.calculate_json_size(mapping), False)
+
+        truncated_obj = {}
+        truncated = False
+        # Start with the size of the surrounding braces.
+        used_size = self.calculate_json_size({})
+
+        # Sort keys to ensure deterministic behavior
+        sorted_keys = sorted(mapping.keys())
+
+        for i, key in enumerate(sorted_keys):
+            if used_size > target_size:
+                # No more room for additional key-value pairs
+                truncated = True
+                break
+
+            pair_size = 0
+
+            if i > 0:
+                pair_size += 1  # Account for comma
+
+            # Calculate budget for this key-value pair
+            # do not try to truncate keys, as we want to keep the structure of
+            # object.
+            key_size = self.calculate_json_size(key) + 1  # +1 for ":"
+            pair_size += key_size
+            # The current key counts as one of the remaining pairs.
+            remaining_pairs = len(sorted_keys) - i
+            value_budget = max(0, (target_size - pair_size - used_size) // remaining_pairs)
+
+            if value_budget <= 0:
+                # Nothing is left for the value: drop this key and all following ones.
+                truncated = True
+                break
+
+            # Truncate the value to fit within budget
+            value = mapping[key]
+            if isinstance(value, Segment):
+                value_result = self._truncate_segment(value, value_budget)
+            else:
+                value_result = self._truncate_json_primitives(mapping[key], value_budget)
+
+            truncated_obj[key] = value_result.value
+            pair_size += value_result.value_size
+            used_size += pair_size
+
+            if value_result.truncated:
+                truncated = True
+
+        return _PartResult(truncated_obj, used_size, truncated)
+
+    @overload
+    def _truncate_json_primitives(self, val: str, target_size: int) -> _PartResult[str]: ...
+
+    @overload
+    def _truncate_json_primitives(self, val: list, target_size: int) -> _PartResult[list]: ...
+
+    @overload
+    def _truncate_json_primitives(self, val: dict, target_size: int) -> _PartResult[dict]: ...
+
+    @overload
+    def _truncate_json_primitives(self, val: bool, target_size: int) -> _PartResult[bool]: ...  # type: ignore
+
+    @overload
+    def _truncate_json_primitives(self, val: int, target_size: int) -> _PartResult[int]: ...
+
+    @overload
+    def _truncate_json_primitives(self, val: float, target_size: int) -> _PartResult[float]: ...
+
+    @overload
+    def _truncate_json_primitives(self, val: None, target_size: int) -> _PartResult[None]: ...
+
+    def _truncate_json_primitives(
+        self, val: str | list | dict | bool | int | float | None, target_size: int
+    ) -> _PartResult[Any]:
+        """Truncate a value within an object to fit within budget."""
+        if isinstance(val, str):
+            return self._truncate_string(val, target_size)
+        elif isinstance(val, list):
+            return self._truncate_array(val, target_size)
+        elif isinstance(val, dict):
+            return self._truncate_object(val, target_size)
+        elif val is None or isinstance(val, (bool, int, float)):
+            return _PartResult(val, self.calculate_json_size(val), False)
+        else:
+            raise AssertionError("this statement should be unreachable.")
--- a/api/services/website_service.py
+++ b/api/services/website_service.py
@ -11,7 +11,7 @@ from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
 from core.rag.extractor.watercrawl.provider import WaterCrawlProvider
 from extensions.ext_redis import redis_client
 from extensions.ext_storage import storage
-from services.auth.api_key_auth_service import ApiKeyAuthService
+from services.datasource_provider_service import DatasourceProviderService


@dataclass
@ -103,7 +103,6 @@ class WebsiteCrawlStatusApiRequest:
    def from_args(cls, args: dict, job_id: str) -> "WebsiteCrawlStatusApiRequest":
        """Create from Flask-RESTful parsed arguments."""
        provider = args.get("provider")
-
        if not provider:
            raise ValueError("Provider is required")
        if not job_id:
@ -116,12 +115,28 @@ class WebsiteService:
    """Service class for website crawling operations using different providers."""

    @classmethod
-    def _get_credentials_and_config(cls, tenant_id: str, provider: str) -> tuple[dict, dict]:
+    def _get_credentials_and_config(cls, tenant_id: str, provider: str) -> tuple[Any, Any]:
        """Get and validate credentials for a provider."""
-        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
-        if not credentials or "config" not in credentials:
-            raise ValueError("No valid credentials found for the provider")
-        return credentials, credentials["config"]
+        # Map the crawl provider name to the plugin id used to look up its datasource credentials.
+        if provider == "firecrawl":
+            plugin_id = "langgenius/firecrawl_datasource"
+        elif provider == "watercrawl":
+            plugin_id = "langgenius/watercrawl_datasource"
+        elif provider == "jinareader":
+            plugin_id = "langgenius/jina_datasource"
+        else:
+            raise ValueError("Invalid provider")
+        datasource_provider_service = DatasourceProviderService()
+        credential = datasource_provider_service.get_datasource_credentials(
+            tenant_id=tenant_id,
+            provider=provider,
+            plugin_id=plugin_id,
+        )
+        # Return (API key, whole credential object); the key is stored under a provider-specific name.
+        # NOTE(review): `credential` is assumed to be a non-None mapping here -- confirm that
+        # `get_datasource_credentials` cannot return None.
+        if provider == "firecrawl":
+            return credential.get("firecrawl_api_key"), credential
+        elif provider in {"watercrawl", "jinareader"}:
+            return credential.get("api_key"), credential
+        else:
+            raise ValueError("Invalid provider")

    @classmethod
    def _get_decrypted_api_key(cls, tenant_id: str, config: dict) -> str:
@ -144,8 +159,7 @@ class WebsiteService:
        """Crawl a URL using the specified provider with typed request."""
        request = api_request.to_crawl_request()

-        _, config = cls._get_credentials_and_config(current_user.current_tenant_id, request.provider)
-        api_key = cls._get_decrypted_api_key(current_user.current_tenant_id, config)
+        api_key, config = cls._get_credentials_and_config(current_user.current_tenant_id, request.provider)

        if request.provider == "firecrawl":
            return cls._crawl_with_firecrawl(request=request, api_key=api_key, config=config)
@ -207,7 +221,7 @@ class WebsiteService:
                headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
            )
            if response.json().get("code") != 200:
-                raise ValueError("Failed to crawl")
+                raise ValueError("Failed to crawl:")
            return {"status": "active", "data": response.json().get("data")}
        else:
            response = requests.post(
@ -235,8 +249,7 @@ class WebsiteService:
    @classmethod
    def get_crawl_status_typed(cls, api_request: WebsiteCrawlStatusApiRequest) -> dict[str, Any]:
        """Get crawl status using typed request."""
-        _, config = cls._get_credentials_and_config(current_user.current_tenant_id, api_request.provider)
-        api_key = cls._get_decrypted_api_key(current_user.current_tenant_id, config)
+        api_key, config = cls._get_credentials_and_config(current_user.current_tenant_id, api_request.provider)

        if api_request.provider == "firecrawl":
            return cls._get_firecrawl_status(api_request.job_id, api_key, config)
@ -310,8 +323,7 @@ class WebsiteService:

    @classmethod
    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict[str, Any] | None:
-        _, config = cls._get_credentials_and_config(tenant_id, provider)
-        api_key = cls._get_decrypted_api_key(tenant_id, config)
+        api_key, config = cls._get_credentials_and_config(tenant_id, provider)

        if provider == "firecrawl":
            return cls._get_firecrawl_url_data(job_id, url, api_key, config)
@ -384,8 +396,7 @@ class WebsiteService:
    def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict[str, Any]:
        request = ScrapeRequest(provider=provider, url=url, tenant_id=tenant_id, only_main_content=only_main_content)

-        _, config = cls._get_credentials_and_config(tenant_id=request.tenant_id, provider=request.provider)
-        api_key = cls._get_decrypted_api_key(tenant_id=request.tenant_id, config=config)
+        api_key, config = cls._get_credentials_and_config(tenant_id=request.tenant_id, provider=request.provider)

        if request.provider == "firecrawl":
            return cls._scrape_with_firecrawl(request=request, api_key=api_key, config=config)
--- a/api/services/workflow/workflow_converter.py
+++ b/api/services/workflow/workflow_converter.py
@ -146,7 +146,7 @@ class WorkflowConverter:
            graph=graph,
            model_config=app_config.model,
            prompt_template=app_config.prompt_template,
-            file_upload=app_config.additional_features.file_upload,
+            file_upload=app_config.additional_features.file_upload if app_config.additional_features else None,
            external_data_variable_node_mapping=external_data_variable_node_mapping,
        )

--- a/api/services/workflow_app_service.py
+++ b/api/services/workflow_app_service.py
@ -4,7 +4,7 @@ from datetime import datetime
 from sqlalchemy import and_, func, or_, select
 from sqlalchemy.orm import Session

-from core.workflow.entities.workflow_execution import WorkflowExecutionStatus
+from core.workflow.enums import WorkflowExecutionStatus
 from models import Account, App, EndUser, WorkflowAppLog, WorkflowRun
 from models.enums import CreatorUserRole

--- a/api/services/workflow_draft_variable_service.py
+++ b/api/services/workflow_draft_variable_service.py
@ -1,32 +1,44 @@
 import dataclasses
+import json
 import logging
 from collections.abc import Mapping, Sequence
+from concurrent.futures import ThreadPoolExecutor
 from enum import StrEnum
 from typing import Any, ClassVar

-from sqlalchemy import Engine, orm
+from sqlalchemy import Engine, orm, select
 from sqlalchemy.dialects.postgresql import insert
 from sqlalchemy.orm import Session, sessionmaker
 from sqlalchemy.sql.expression import and_, or_

+from configs import dify_config
 from core.app.entities.app_invoke_entities import InvokeFrom
 from core.file.models import File
 from core.variables import Segment, StringSegment, Variable
 from core.variables.consts import SELECTORS_LENGTH
-from core.variables.segments import ArrayFileSegment, FileSegment
+from core.variables.segments import (
+    ArrayFileSegment,
+    FileSegment,
+)
 from core.variables.types import SegmentType
+from core.variables.utils import dumps_with_segments
 from core.workflow.constants import CONVERSATION_VARIABLE_NODE_ID, ENVIRONMENT_VARIABLE_NODE_ID, SYSTEM_VARIABLE_NODE_ID
 from core.workflow.enums import SystemVariableKey
 from core.workflow.nodes import NodeType
 from core.workflow.nodes.variable_assigner.common.helpers import get_updated_variables
 from core.workflow.variable_loader import VariableLoader
+from extensions.ext_storage import storage
 from factories.file_factory import StorageKeyLoader
 from factories.variable_factory import build_segment, segment_to_variable
 from libs.datetime_utils import naive_utc_now
+from libs.uuid_utils import uuidv7
 from models import App, Conversation
+from models.account import Account
 from models.enums import DraftVariableType
-from models.workflow import Workflow, WorkflowDraftVariable, is_system_variable_editable
+from models.workflow import Workflow, WorkflowDraftVariable, WorkflowDraftVariableFile, is_system_variable_editable
 from repositories.factory import DifyAPIRepositoryFactory
+from services.file_service import FileService
+from services.variable_truncator import VariableTruncator

 logger = logging.getLogger(__name__)

@ -37,6 +49,12 @@ class WorkflowDraftVariableList:
    total: int | None = None


+@dataclasses.dataclass(frozen=True)
+class DraftVarFileDeletion:
+    # Immutable pair of ids. Presumably identifies a draft variable file to delete together
+    # with the draft variable it belongs to -- TODO confirm against the code that uses it.
+    draft_var_id: str
+    draft_var_file_id: str
+
+
 class WorkflowDraftVariableError(Exception):
    pass

@ -87,7 +105,26 @@ class DraftVarLoader(VariableLoader):
            srv = WorkflowDraftVariableService(session)
            draft_vars = srv.get_draft_variables_by_selectors(self._app_id, selectors)

+        # Important:
+        files: list[File] = []
+        # FileSegment and ArrayFileSegment are not subject to offloading, so their values
+        # can be safely accessed before any offloading logic is applied.
        for draft_var in draft_vars:
+            value = draft_var.get_value()
+            if isinstance(value, FileSegment):
+                files.append(value.value)
+            elif isinstance(value, ArrayFileSegment):
+                files.extend(value.value)
+        with Session(bind=self._engine) as session:
+            storage_key_loader = StorageKeyLoader(session, tenant_id=self._tenant_id)
+            storage_key_loader.load_storage_keys(files)
+
+        offloaded_draft_vars = []
+        for draft_var in draft_vars:
+            if draft_var.is_truncated():
+                offloaded_draft_vars.append(draft_var)
+                continue
+
            segment = draft_var.get_value()
            variable = segment_to_variable(
                segment=segment,
@ -99,20 +136,51 @@ class DraftVarLoader(VariableLoader):
            selector_tuple = self._selector_to_tuple(variable.selector)
            variable_by_selector[selector_tuple] = variable

-        # Important:
-        files: list[File] = []
-        for draft_var in draft_vars:
-            value = draft_var.get_value()
-            if isinstance(value, FileSegment):
-                files.append(value.value)
-            elif isinstance(value, ArrayFileSegment):
-                files.extend(value.value)
-        with Session(bind=self._engine) as session:
-            storage_key_loader = StorageKeyLoader(session, tenant_id=self._tenant_id)
-            storage_key_loader.load_storage_keys(files)
+        # Load offloaded variables using multithreading.
+        # This approach reduces loading time by querying external systems concurrently.
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            offloaded_variables = executor.map(self._load_offloaded_variable, offloaded_draft_vars)
+            for selector, variable in offloaded_variables:
+                variable_by_selector[selector] = variable

        return list(variable_by_selector.values())

+    def _load_offloaded_variable(self, draft_var: WorkflowDraftVariable) -> tuple[tuple[str, str], Variable]:
+        # This logic is closely tied to `WorkflowDraftVaribleService._try_offload_large_variable`
+        # and must remain synchronized with it.
+        # Ideally, these should be co-located for better maintainability.
+        # However, due to the current code structure, this is not straightforward.
+
+        variable_file = draft_var.variable_file
+        assert variable_file is not None
+        upload_file = variable_file.upload_file
+        assert upload_file is not None
+        content = storage.load(upload_file.key)
+        if variable_file.value_type == SegmentType.STRING:
+            # The inferenced type is StringSegment, which is not correct inside this function.
+            segment: Segment = StringSegment(value=content.decode())
+
+            variable = segment_to_variable(
+                segment=segment,
+                selector=draft_var.get_selector(),
+                id=draft_var.id,
+                name=draft_var.name,
+                description=draft_var.description,
+            )
+            return (draft_var.node_id, draft_var.name), variable
+
+        deserialized = json.loads(content)
+        segment = WorkflowDraftVariable.build_segment_with_type(variable_file.value_type, deserialized)
+        variable = segment_to_variable(
+            segment=segment,
+            selector=draft_var.get_selector(),
+            id=draft_var.id,
+            name=draft_var.name,
+            description=draft_var.description,
+        )
+        # No special handling needed for  ArrayFileSegment, as we do not offload ArrayFileSegment
+        return (draft_var.node_id, draft_var.name), variable
+

 class WorkflowDraftVariableService:
    _session: Session
@ -138,13 +206,24 @@ class WorkflowDraftVariableService:
        )

    def get_variable(self, variable_id: str) -> WorkflowDraftVariable | None:
-        return self._session.query(WorkflowDraftVariable).where(WorkflowDraftVariable.id == variable_id).first()
+        # Look up a draft variable by id (None when absent), loading its `variable_file`
+        # relationship along with it via `selectinload`.
+        return (
+            self._session.query(WorkflowDraftVariable)
+            .options(orm.selectinload(WorkflowDraftVariable.variable_file))
+            .where(WorkflowDraftVariable.id == variable_id)
+            .first()
+        )

    def get_draft_variables_by_selectors(
        self,
        app_id: str,
        selectors: Sequence[list[str]],
    ) -> list[WorkflowDraftVariable]:
+        """
+        Retrieve WorkflowDraftVariable instances based on app_id and selectors.
+
+        The returned WorkflowDraftVariable objects are guaranteed to have their
+        associated variable_file and variable_file.upload_file relationships preloaded.
+        """
        ors = []
        for selector in selectors:
            assert len(selector) >= SELECTORS_LENGTH, f"Invalid selector to get: {selector}"
@ -159,7 +238,14 @@ class WorkflowDraftVariableService:
        # combined using `UNION` to fetch all rows.
        # Benchmarking indicates that both approaches yield comparable performance.
        variables = (
-            self._session.query(WorkflowDraftVariable).where(WorkflowDraftVariable.app_id == app_id, or_(*ors)).all()
+            self._session.query(WorkflowDraftVariable)
+            .options(
+                orm.selectinload(WorkflowDraftVariable.variable_file).selectinload(
+                    WorkflowDraftVariableFile.upload_file
+                )
+            )
+            .where(WorkflowDraftVariable.app_id == app_id, or_(*ors))
+            .all()
        )
        return variables

@ -170,8 +256,10 @@ class WorkflowDraftVariableService:
        if page == 1:
            total = query.count()
        variables = (
-            # Do not load the `value` field.
-            query.options(orm.defer(WorkflowDraftVariable.value))
+            # Do not load the `value` field
+            query.options(
+                orm.defer(WorkflowDraftVariable.value, raiseload=True),
+            )
            .order_by(WorkflowDraftVariable.created_at.desc())
            .limit(limit)
            .offset((page - 1) * limit)
@ -186,7 +274,11 @@ class WorkflowDraftVariableService:
            WorkflowDraftVariable.node_id == node_id,
        )
        query = self._session.query(WorkflowDraftVariable).where(*criteria)
-        variables = query.order_by(WorkflowDraftVariable.created_at.desc()).all()
+        variables = (
+            query.options(orm.selectinload(WorkflowDraftVariable.variable_file))
+            .order_by(WorkflowDraftVariable.created_at.desc())
+            .all()
+        )
        return WorkflowDraftVariableList(variables=variables)

    def list_node_variables(self, app_id: str, node_id: str) -> WorkflowDraftVariableList:
@ -210,6 +302,7 @@ class WorkflowDraftVariableService:
    def _get_variable(self, app_id: str, node_id: str, name: str) -> WorkflowDraftVariable | None:
        variable = (
            self._session.query(WorkflowDraftVariable)
+            .options(orm.selectinload(WorkflowDraftVariable.variable_file))
            .where(
                WorkflowDraftVariable.app_id == app_id,
                WorkflowDraftVariable.node_id == node_id,
@ -278,7 +371,7 @@ class WorkflowDraftVariableService:
            self._session.flush()
            return None

-        outputs_dict = node_exec.outputs_dict or {}
+        outputs_dict = node_exec.load_full_outputs(self._session, storage) or {}
        # a sentinel value used to check the absent of the output variable key.
        absent = object()

@ -323,6 +416,49 @@ class WorkflowDraftVariableService:
            return self._reset_node_var_or_sys_var(workflow, variable)

    def delete_variable(self, variable: WorkflowDraftVariable):
+        if not variable.is_truncated():
+            self._session.delete(variable)
+            return
+
+        variable_query = (
+            select(WorkflowDraftVariable)
+            .options(
+                orm.selectinload(WorkflowDraftVariable.variable_file).selectinload(
+                    WorkflowDraftVariableFile.upload_file
+                ),
+            )
+            .where(WorkflowDraftVariable.id == variable.id)
+        )
+        variable_reloaded = self._session.execute(variable_query).scalars().first()
+        if variable_reloaded is None:
+            logger.warning("Associated WorkflowDraftVariable not found, draft_var_id=%s", variable.id)
+            self._session.delete(variable)
+            return
+        variable_file = variable_reloaded.variable_file
+        if variable_file is None:
+            logger.warning(
+                "Associated WorkflowDraftVariableFile not found, draft_var_id=%s, file_id=%s",
+                variable_reloaded.id,
+                variable_reloaded.file_id,
+            )
+            self._session.delete(variable)
+            return
+
+        upload_file = variable_file.upload_file
+        if upload_file is None:
+            logger.warning(
+                "Associated UploadFile not found, draft_var_id=%s, file_id=%s, upload_file_id=%s",
+                variable_reloaded.id,
+                variable_reloaded.file_id,
+                variable_file.upload_file_id,
+            )
+            self._session.delete(variable)
+            self._session.delete(variable_file)
+            return
+
+        storage.delete(upload_file.key)
+        self._session.delete(upload_file)
+        self._session.delete(upload_file)
        self._session.delete(variable)

    def delete_workflow_variables(self, app_id: str):
@ -332,6 +468,38 @@ class WorkflowDraftVariableService:
            .delete(synchronize_session=False)
        )

+    def delete_workflow_draft_variable_file(self, deletions: list[DraftVarFileDeletion]):
+        """Delete the offloaded variable files named in `deletions`.
+
+        All referenced `WorkflowDraftVariableFile` rows are fetched in one query with
+        their `upload_file` eagerly loaded. For each requested deletion the stored
+        object and the upload file row are removed when present, and the variable
+        file row is marked for deletion. Missing rows are logged as warnings.
+        """
+        requested_ids = [deletion.draft_var_file_id for deletion in deletions]
+        stmt = (
+            select(WorkflowDraftVariableFile)
+            .options(orm.selectinload(WorkflowDraftVariableFile.upload_file))
+            .where(WorkflowDraftVariableFile.id.in_(requested_ids))
+        )
+        files_by_id = {row.id: row for row in self._session.execute(stmt).scalars().all()}
+
+        for deletion in deletions:
+            variable_file = files_by_id.get(deletion.draft_var_file_id)
+            if variable_file is None:
+                logger.warning(
+                    "Associated WorkflowDraftVariableFile not found, draft_var_id=%s, file_id=%s",
+                    deletion.draft_var_id,
+                    deletion.draft_var_file_id,
+                )
+                continue
+
+            upload_file = variable_file.upload_file
+            if upload_file is not None:
+                storage.delete(upload_file.key)
+                self._session.delete(upload_file)
+            else:
+                logger.warning(
+                    "Associated UploadFile not found, draft_var_id=%s, file_id=%s, upload_file_id=%s",
+                    deletion.draft_var_id,
+                    deletion.draft_var_file_id,
+                    variable_file.upload_file_id,
+                )
+            # The variable file row is removed whether or not its upload file was found.
+            self._session.delete(variable_file)
+
    def delete_node_variables(self, app_id: str, node_id: str):
        return self._delete_node_variables(app_id, node_id)

@ -476,6 +644,7 @@ def _batch_upsert_draft_variable(
                "visible": stmt.excluded.visible,
                "editable": stmt.excluded.editable,
                "node_execution_id": stmt.excluded.node_execution_id,
+                "file_id": stmt.excluded.file_id,
            },
        )
    elif policy == _UpsertPolicy.IGNORE:
@ -495,6 +664,7 @@ def _model_to_insertion_dict(model: WorkflowDraftVariable) -> dict[str, Any]:
        "value_type": model.value_type,
        "value": model.value,
        "node_execution_id": model.node_execution_id,
+        "file_id": model.file_id,
    }
    if model.visible is not None:
        d["visible"] = model.visible
@ -524,6 +694,28 @@ def _build_segment_for_serialized_values(v: Any) -> Segment:
    return build_segment(WorkflowDraftVariable.rebuild_file_types(v))


+def _make_filename_trans_table() -> dict[int, str]:
+    """Build a `str.translate` table that maps filename-unsafe characters to "_".
+
+    Covers "/" and NUL, the characters `<>:"/\\|?*`, and the control characters
+    with code points 0-31.
+    """
+    separators_and_nul = "/\x00"
+    reserved_punctuation = '<>:"/\\|?*'
+    control_chars = "".join(map(chr, range(32)))
+
+    # Duplicates ("/" and NUL appear twice) collapse naturally when used as dict keys.
+    unsafe = separators_and_nul + reserved_punctuation + control_chars
+    return str.maketrans(dict.fromkeys(unsafe, "_"))
+
+
+_FILENAME_TRANS_TABLE = _make_filename_trans_table()
+
+
 class DraftVariableSaver:
    # _DUMMY_OUTPUT_IDENTITY is a placeholder output for workflow nodes.
    # Its sole possible value is `None`.
@ -573,6 +765,7 @@ class DraftVariableSaver:
        node_id: str,
        node_type: NodeType,
        node_execution_id: str,
+        user: Account,
        enclosing_node_id: str | None = None,
    ):
        # Important: `node_execution_id` parameter refers to the primary key (`id`) of the
@ -583,6 +776,7 @@ class DraftVariableSaver:
        self._node_id = node_id
        self._node_type = node_type
        self._node_execution_id = node_execution_id
+        self._user = user
        self._enclosing_node_id = enclosing_node_id

    def _create_dummy_output_variable(self):
@ -692,17 +886,133 @@ class DraftVariableSaver:
            else:
                value_seg = _build_segment_for_serialized_values(value)
            draft_vars.append(
-                WorkflowDraftVariable.new_node_variable(
-                    app_id=self._app_id,
-                    node_id=self._node_id,
+                self._create_draft_variable(
                    name=name,
-                    node_execution_id=self._node_execution_id,
                    value=value_seg,
-                    visible=self._should_variable_be_visible(self._node_id, self._node_type, name),
-                )
+                    visible=True,
+                    editable=True,
+                ),
+                # WorkflowDraftVariable.new_node_variable(
+                #     app_id=self._app_id,
+                #     node_id=self._node_id,
+                #     name=name,
+                #     node_execution_id=self._node_execution_id,
+                #     value=value_seg,
+                #     visible=self._should_variable_be_visible(self._node_id, self._node_type, name),
+                # )
            )
        return draft_vars

+    def _generate_filename(self, name: str):
+        """Return the base filename (without extension) for an offloaded variable.
+
+        Both the node id and the variable name are passed through
+        `_FILENAME_TRANS_TABLE`, so no filename-unsafe character from either part
+        reaches the generated filename.
+        """
+        node_id_escaped = self._node_id.translate(_FILENAME_TRANS_TABLE)
+        # The variable name needs the same escaping as the node id.
+        name_escaped = name.translate(_FILENAME_TRANS_TABLE)
+        return f"{node_id_escaped}-{name_escaped}"
+
+    def _try_offload_large_variable(
+        self,
+        name: str,
+        value_seg: Segment,
+    ) -> tuple[Segment, WorkflowDraftVariableFile] | None:
+        """Offload `value_seg` to file storage when it exceeds the truncation limits.
+
+        Args:
+            name: Variable name, used to build the uploaded file's name.
+            value_seg: The full, untruncated value.
+
+        Returns:
+            `None` when the truncator reports the value was not truncated. Otherwise
+            a tuple of the truncated segment and the `WorkflowDraftVariableFile`
+            record describing the uploaded full content.
+        """
+        # This logic is closely tied to `DraftVarLoader._load_offloaded_variable` and must remain
+        # synchronized with it.
+        # Ideally, these should be co-located for better maintainability.
+        # However, due to the current code structure, this is not straightforward.
+        truncator = VariableTruncator(
+            max_size_bytes=dify_config.WORKFLOW_VARIABLE_TRUNCATION_MAX_SIZE,
+            array_element_limit=dify_config.WORKFLOW_VARIABLE_TRUNCATION_ARRAY_LENGTH,
+            string_length_limit=dify_config.WORKFLOW_VARIABLE_TRUNCATION_STRING_LENGTH,
+        )
+        truncation_result = truncator.truncate(value_seg)
+        if not truncation_result.truncated:
+            return None
+
+        # Element count is only recorded for container values.
+        original_length = None
+        if isinstance(value_seg.value, (list, dict)):
+            original_length = len(value_seg.value)
+
+        # Prepare content for storage
+        if isinstance(value_seg, StringSegment):
+            # For string types, store as plain text
+            original_content_serialized = value_seg.value
+            content_type = "text/plain"
+            filename = f"{self._generate_filename(name)}.txt"
+        else:
+            # For other types, store as JSON
+            original_content_serialized = dumps_with_segments(value_seg.value, ensure_ascii=False)
+            content_type = "application/json"
+            filename = f"{self._generate_filename(name)}.json"
+
+        # Encode once: this payload is large by definition, and the same bytes serve
+        # both the size bookkeeping and the upload.
+        content_bytes = original_content_serialized.encode("utf-8")
+        original_size = len(content_bytes)
+
+        engine = self._session.get_bind()
+        assert isinstance(engine, Engine)
+        file_srv = FileService(engine)
+
+        upload_file = file_srv.upload_file(
+            filename=filename,
+            content=content_bytes,
+            mimetype=content_type,
+            user=self._user,
+        )
+
+        # Create WorkflowDraftVariableFile record
+        variable_file = WorkflowDraftVariableFile(
+            id=uuidv7(),
+            upload_file_id=upload_file.id,
+            size=original_size,
+            length=original_length,
+            value_type=value_seg.value_type,
+            app_id=self._app_id,
+            tenant_id=self._user.current_tenant_id,
+            user_id=self._user.id,
+        )
+        # The record is committed in its own session, independently of `self._session`.
+        with Session(bind=engine, expire_on_commit=False) as session:
+            session.add(variable_file)
+            session.commit()
+
+        return truncation_result.result, variable_file
+
+    def _create_draft_variable(
+        self,
+        *,
+        name: str,
+        value: Segment,
+        visible: bool = True,
+        editable: bool = True,
+    ) -> WorkflowDraftVariable:
+        """Build a node draft variable, offloading the value first when it is too large.
+
+        When the value is offloaded, the variable stores the truncated value, links to
+        the offloaded file via `file_id`, and is always created non-editable regardless
+        of the `editable` argument.
+        """
+        offloaded = self._try_offload_large_variable(name, value)
+
+        if offloaded is not None:
+            truncated_value, variable_file = offloaded
+            return WorkflowDraftVariable.new_node_variable(
+                app_id=self._app_id,
+                node_id=self._node_id,
+                name=name,
+                node_execution_id=self._node_execution_id,
+                value=truncated_value,
+                visible=visible,
+                editable=False,
+                file_id=variable_file.id,
+            )
+
+        # Nothing was offloaded: keep the full value in place.
+        return WorkflowDraftVariable.new_node_variable(
+            app_id=self._app_id,
+            node_id=self._node_id,
+            name=name,
+            node_execution_id=self._node_execution_id,
+            value=value,
+            visible=visible,
+            editable=editable,
+        )
+
+
    def save(
        self,
        process_data: Mapping[str, Any] | None = None,
--- a/api/services/workflow_service.py
+++ b/api/services/workflow_service.py
@ -3,7 +3,6 @@ import time
 import uuid
 from collections.abc import Callable, Generator, Mapping, Sequence
 from typing import Any, cast
-from uuid import uuid4

 from sqlalchemy import exists, select
 from sqlalchemy.orm import Session, sessionmaker
@ -15,22 +14,20 @@ from core.file import File
 from core.repositories import DifyCoreRepositoryFactory
 from core.variables import Variable
 from core.variables.variables import VariableUnion
-from core.workflow.entities.node_entities import NodeRunResult
-from core.workflow.entities.variable_pool import VariablePool
-from core.workflow.entities.workflow_node_execution import WorkflowNodeExecution, WorkflowNodeExecutionStatus
+from core.workflow.entities import VariablePool, WorkflowNodeExecution
+from core.workflow.enums import ErrorStrategy, WorkflowNodeExecutionMetadataKey, WorkflowNodeExecutionStatus
 from core.workflow.errors import WorkflowNodeRunFailedError
-from core.workflow.graph_engine.entities.event import InNodeEvent
+from core.workflow.graph_events import GraphNodeEventBase, NodeRunFailedEvent, NodeRunSucceededEvent
+from core.workflow.node_events import NodeRunResult
 from core.workflow.nodes import NodeType
-from core.workflow.nodes.base.node import BaseNode
-from core.workflow.nodes.enums import ErrorStrategy
-from core.workflow.nodes.event import RunCompletedEvent
-from core.workflow.nodes.event.types import NodeEvent
+from core.workflow.nodes.base.node import Node
 from core.workflow.nodes.node_mapping import LATEST_VERSION, NODE_TYPE_CLASSES_MAPPING
 from core.workflow.nodes.start.entities import StartNodeData
 from core.workflow.system_variable import SystemVariable
 from core.workflow.workflow_entry import WorkflowEntry
 from events.app_event import app_draft_workflow_was_synced, app_published_workflow_was_updated
 from extensions.ext_database import db
+from extensions.ext_storage import storage
 from factories.file_factory import build_from_mapping, build_from_mappings
 from libs.datetime_utils import naive_utc_now
 from models.account import Account
@ -276,12 +273,13 @@ class WorkflowService:
            type=draft_workflow.type,
            version=Workflow.version_from_datetime(naive_utc_now()),
            graph=draft_workflow.graph,
-            features=draft_workflow.features,
            created_by=account.id,
            environment_variables=draft_workflow.environment_variables,
            conversation_variables=draft_workflow.conversation_variables,
            marked_name=marked_name,
            marked_comment=marked_comment,
+            rag_pipeline_variables=draft_workflow.rag_pipeline_variables,
+            features=draft_workflow.features,
        )

        # commit db session changes
@ -565,12 +563,12 @@ class WorkflowService:
            # This will prevent validation errors from breaking the workflow
            return []

-    def get_default_block_configs(self) -> list[dict]:
+    def get_default_block_configs(self) -> Sequence[Mapping[str, object]]:
        """
        Get default block configs
        """
        # return default block config
-        default_block_configs = []
+        default_block_configs: list[Mapping[str, object]] = []
        for node_class_mapping in NODE_TYPE_CLASSES_MAPPING.values():
            node_class = node_class_mapping[LATEST_VERSION]
            default_config = node_class.get_default_config()
@ -579,7 +577,9 @@ class WorkflowService:

        return default_block_configs

-    def get_default_block_config(self, node_type: str, filters: dict | None = None) -> dict | None:
+    def get_default_block_config(
+        self, node_type: str, filters: Mapping[str, object] | None = None
+    ) -> Mapping[str, object]:
        """
        Get default config of node.
        :param node_type: node type
@ -590,12 +590,12 @@ class WorkflowService:

        # return default block config
        if node_type_enum not in NODE_TYPE_CLASSES_MAPPING:
-            return None
+            return {}

        node_class = NODE_TYPE_CLASSES_MAPPING[node_type_enum][LATEST_VERSION]
        default_config = node_class.get_default_config(filters=filters)
        if not default_config:
-            return None
+            return {}

        return default_config

@ -677,7 +677,7 @@ class WorkflowService:

        # run draft workflow node
        start_at = time.perf_counter()
-        node_execution = self._handle_node_run_result(
+        node_execution = self._handle_single_step_result(
            invoke_node_fn=lambda: run,
            start_at=start_at,
            node_id=node_id,
@ -699,6 +699,9 @@ class WorkflowService:
        if workflow_node_execution is None:
            raise ValueError(f"WorkflowNodeExecution with id {node_execution.id} not found after saving")

+        with Session(db.engine) as session:
+            outputs = workflow_node_execution.load_full_outputs(session, storage)
+
        with Session(bind=db.engine) as session, session.begin():
            draft_var_saver = DraftVariableSaver(
                session=session,
@ -707,8 +710,9 @@ class WorkflowService:
                node_type=NodeType(workflow_node_execution.node_type),
                enclosing_node_id=enclosing_node_id,
                node_execution_id=node_execution.id,
+                user=account,
            )
-            draft_var_saver.save(process_data=node_execution.process_data, outputs=node_execution.outputs)
+            draft_var_saver.save(process_data=node_execution.process_data, outputs=outputs)
            session.commit()

        return workflow_node_execution
@ -722,7 +726,7 @@ class WorkflowService:
        # run free workflow node
        start_at = time.perf_counter()

-        node_execution = self._handle_node_run_result(
+        node_execution = self._handle_single_step_result(
            invoke_node_fn=lambda: WorkflowEntry.run_free_node(
                node_id=node_id,
                node_data=node_data,
@ -736,103 +740,131 @@ class WorkflowService:

        return node_execution

-    def _handle_node_run_result(
+    def _handle_single_step_result(
        self,
-        invoke_node_fn: Callable[[], tuple[BaseNode, Generator[NodeEvent | InNodeEvent, None, None]]],
+        invoke_node_fn: Callable[[], tuple[Node, Generator[GraphNodeEventBase, None, None]]],
        start_at: float,
        node_id: str,
    ) -> WorkflowNodeExecution:
-        try:
-            node, node_events = invoke_node_fn()
+        """
+        Handle single step execution and return WorkflowNodeExecution.

-            node_run_result: NodeRunResult | None = None
-            for event in node_events:
-                if isinstance(event, RunCompletedEvent):
-                    node_run_result = event.run_result
+        Args:
+            invoke_node_fn: Function to invoke node execution
+            start_at: Execution start time
+            node_id: ID of the node being executed

-                    # sign output files
-                    # node_run_result.outputs = WorkflowEntry.handle_special_values(node_run_result.outputs)
-                    break
+        Returns:
+            WorkflowNodeExecution: The execution result
+        """
+        node, node_run_result, run_succeeded, error = self._execute_node_safely(invoke_node_fn)

-            if not node_run_result:
-                raise ValueError("Node run failed with no run result")
-            # single step debug mode error handling return
-            if node_run_result.status == WorkflowNodeExecutionStatus.FAILED and node.continue_on_error:
-                node_error_args: dict[str, Any] = {
-                    "status": WorkflowNodeExecutionStatus.EXCEPTION,
-                    "error": node_run_result.error,
-                    "inputs": node_run_result.inputs,
-                    "metadata": {"error_strategy": node.error_strategy},
-                }
-                if node.error_strategy is ErrorStrategy.DEFAULT_VALUE:
-                    node_run_result = NodeRunResult(
-                        **node_error_args,
-                        outputs={
-                            **node.default_value_dict,
-                            "error_message": node_run_result.error,
-                            "error_type": node_run_result.error_type,
-                        },
-                    )
-                else:
-                    node_run_result = NodeRunResult(
-                        **node_error_args,
-                        outputs={
-                            "error_message": node_run_result.error,
-                            "error_type": node_run_result.error_type,
-                        },
-                    )
-            run_succeeded = node_run_result.status in (
-                WorkflowNodeExecutionStatus.SUCCEEDED,
-                WorkflowNodeExecutionStatus.EXCEPTION,
-            )
-            error = node_run_result.error if not run_succeeded else None
-        except WorkflowNodeRunFailedError as e:
-            node = e.node
-            run_succeeded = False
-            node_run_result = None
-            error = e.error
-
-        # Create a NodeExecution domain model
+        # Create base node execution
        node_execution = WorkflowNodeExecution(
-            id=str(uuid4()),
-            workflow_id="",  # This is a single-step execution, so no workflow ID
+            id=str(uuid.uuid4()),
+            workflow_id="",  # Single-step execution has no workflow ID
            index=1,
            node_id=node_id,
-            node_type=node.type_,
+            node_type=node.node_type,
            title=node.title,
            elapsed_time=time.perf_counter() - start_at,
            created_at=naive_utc_now(),
            finished_at=naive_utc_now(),
        )

+        # Populate execution result data
+        self._populate_execution_result(node_execution, node_run_result, run_succeeded, error)
+
+        return node_execution
+
+    def _execute_node_safely(
+        self, invoke_node_fn: Callable[[], tuple[Node, Generator[GraphNodeEventBase, None, None]]]
+    ) -> tuple[Node, NodeRunResult | None, bool, str | None]:
+        """
+        Invoke the node and reduce its event stream to a single outcome.
+
+        The run result is taken from the first succeeded/failed event. A FAILED result
+        on a node that has an error strategy is converted via `_apply_error_strategy`.
+        A `WorkflowNodeRunFailedError` raised along the way is reported as a failure
+        with no run result.
+
+        Returns:
+            Tuple of (node, node_run_result, run_succeeded, error)
+
+        Raises:
+            ValueError: If the event stream yields no run result.
+        """
+        try:
+            node, node_events = invoke_node_fn()
+
+            # Stop consuming events at the first terminal one.
+            result: NodeRunResult | None = None
+            for event in node_events:
+                if isinstance(event, (NodeRunSucceededEvent, NodeRunFailedEvent)):
+                    result = event.node_run_result
+                    break
+
+            if not result:
+                raise ValueError("Node execution failed - no result returned")
+
+            # Apply error strategy if node failed
+            if result.status == WorkflowNodeExecutionStatus.FAILED and node.error_strategy:
+                result = self._apply_error_strategy(node, result)
+
+            # EXCEPTION counts as success here: the error strategy handled the failure.
+            succeeded = result.status in (
+                WorkflowNodeExecutionStatus.SUCCEEDED,
+                WorkflowNodeExecutionStatus.EXCEPTION,
+            )
+            return node, result, succeeded, (None if succeeded else result.error)
+        except WorkflowNodeRunFailedError as e:
+            return e.node, None, False, e.error
+
+
+    def _apply_error_strategy(self, node: Node, node_run_result: NodeRunResult) -> NodeRunResult:
+        """Convert a failed run result into an EXCEPTION result per the node's error strategy.
+
+        The returned result keeps the original error and inputs, records the strategy
+        in its metadata, and exposes `error_message` / `error_type` as outputs. With
+        the DEFAULT_VALUE strategy the node's default values are included as well.
+        """
+        # TODO(Novice): Maybe we should apply error strategy to node level?
+        error_outputs: dict[str, Any] = {}
+
+        # Add default values first if strategy is DEFAULT_VALUE, so that a default
+        # named `error_message` / `error_type` can never mask the real error fields.
+        if node.error_strategy is ErrorStrategy.DEFAULT_VALUE:
+            error_outputs.update(node.default_value_dict)
+
+        error_outputs["error_message"] = node_run_result.error
+        error_outputs["error_type"] = node_run_result.error_type
+
+        return NodeRunResult(
+            status=WorkflowNodeExecutionStatus.EXCEPTION,
+            error=node_run_result.error,
+            inputs=node_run_result.inputs,
+            metadata={WorkflowNodeExecutionMetadataKey.ERROR_STRATEGY: node.error_strategy},
+            outputs=error_outputs,
+        )
+
+    def _populate_execution_result(
+        self,
+        node_execution: WorkflowNodeExecution,
+        node_run_result: NodeRunResult | None,
+        run_succeeded: bool,
+        error: str | None,
+    ) -> None:
+        """Populate node execution with result data.
+
+        When the run succeeded and a result is available, inputs and process data are
+        copied through `WorkflowEntry.handle_special_values` (or set to None when
+        empty), outputs and metadata are copied as-is, and the status is taken from the
+        result. Otherwise the execution is marked FAILED with the given `error`.
+
+        Args:
+            node_execution: The execution record to mutate in place.
+            node_run_result: Result of the node run, or None if none was produced.
+            run_succeeded: Whether the run counts as successful.
+            error: Error message recorded on the failure path.
+        """
         if run_succeeded and node_run_result:
-            # Set inputs, process_data, and outputs as dictionaries (not JSON strings)
-            inputs = WorkflowEntry.handle_special_values(node_run_result.inputs) if node_run_result.inputs else None
-            process_data = (
+            node_execution.inputs = (
+                WorkflowEntry.handle_special_values(node_run_result.inputs) if node_run_result.inputs else None
+            )
+            node_execution.process_data = (
                 WorkflowEntry.handle_special_values(node_run_result.process_data)
                 if node_run_result.process_data
                 else None
             )
-            outputs = node_run_result.outputs
-
-            node_execution.inputs = inputs
-            node_execution.process_data = process_data
-            node_execution.outputs = outputs
+            node_execution.outputs = node_run_result.outputs
             node_execution.metadata = node_run_result.metadata

-            # Map status from WorkflowNodeExecutionStatus to NodeExecutionStatus
-            if node_run_result.status == WorkflowNodeExecutionStatus.SUCCEEDED:
-                node_execution.status = WorkflowNodeExecutionStatus.SUCCEEDED
-            elif node_run_result.status == WorkflowNodeExecutionStatus.EXCEPTION:
-                node_execution.status = WorkflowNodeExecutionStatus.EXCEPTION
+            # Set status and error based on result
+            # (the error is only copied for EXCEPTION results; other statuses leave it untouched)
+            node_execution.status = node_run_result.status
+            if node_run_result.status == WorkflowNodeExecutionStatus.EXCEPTION:
                 node_execution.error = node_run_result.error
         else:
-            # Set failed status and error
             node_execution.status = WorkflowNodeExecutionStatus.FAILED
             node_execution.error = error

-        return node_execution
-
    def convert_to_workflow(self, app_model: App, account: Account, args: dict) -> App:
        """
        Basic mode of chatbot app(expert mode) to workflow