Feat: chunk overlap supported (#2209)
Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
parent
3322710dac
commit
89fcf4ea7c
9 changed files with 53 additions and 8 deletions
|
|
@@ -562,7 +562,7 @@ class IndexingRunner:
|
|||
|
||||
character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
|
||||
chunk_size=segmentation["max_tokens"],
|
||||
- chunk_overlap=0,
|
||||
+ chunk_overlap=segmentation.get('chunk_overlap', 0),
|
||||
fixed_separator=separator,
|
||||
separators=["\n\n", "。", ".", " ", ""],
|
||||
embedding_model_instance=embedding_model_instance
|
||||
|
|
@@ -571,7 +571,7 @@ class IndexingRunner:
|
|||
# Automatic segmentation
|
||||
character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder(
|
||||
chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'],
|
||||
- chunk_overlap=0,
|
||||
+ chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'],
|
||||
separators=["\n\n", "。", ".", " ", ""],
|
||||
embedding_model_instance=embedding_model_instance
|
||||
)
|
||||
|
|
|
|||
|
|
@@ -134,7 +134,8 @@ class DatasetProcessRule(db.Model):
|
|||
],
|
||||
'segmentation': {
|
||||
'delimiter': '\n',
|
||||
- 'max_tokens': 1000
|
||||
+ 'max_tokens': 500,
|
||||
+ 'chunk_overlap': 50
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@@ -241,7 +241,8 @@ class DocumentService:
|
|||
],
|
||||
'segmentation': {
|
||||
'delimiter': '\n',
|
||||
- 'max_tokens': 500
|
||||
+ 'max_tokens': 500,
|
||||
+ 'chunk_overlap': 50
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue