fix: Use the identifier column as hash if available (#9405)

* fix: Use the identifier column as hash if available

* Update kb_ingest.py

* [autofix.ci] apply automated fixes

* Update src/backend/base/langflow/components/data/kb_ingest.py

Co-authored-by: Edwin Jose <edwin.jose@datastax.com>

* [autofix.ci] apply automated fixes

* [autofix.ci] apply automated fixes

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Edwin Jose <edwin.jose@datastax.com>
This commit is contained in:
Eric Hare 2025-08-19 14:21:41 -07:00 committed by GitHub
commit e63e879af6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 13 additions and 7 deletions

View file

@ -139,8 +139,8 @@ class KBIngestionComponent(Component):
{
"column_name": "text",
"vectorize": True,
"identifier": False,
}
"identifier": True,
},
],
),
IntInput(
@ -402,16 +402,22 @@ class KBIngestionComponent(Component):
# Convert each row to a Data object
for _, row in df_source.iterrows():
# Build content text from vectorized columns using list comprehension
content_parts = [str(row[col]) for col in content_cols if col in row and pd.notna(row[col])]
# Build content text from the content columns (content_cols) using list comprehension
identifier_parts = [str(row[col]) for col in content_cols if col in row and pd.notna(row[col])]
page_content = " ".join(content_parts)
# Join all parts into a single string
page_content = " ".join(identifier_parts)
# Build metadata from NON-vectorized columns only (simple key-value pairs)
data_dict = {
"text": page_content, # Main content for vectorization
}
# If identifier columns exist, rebuild page_content from them instead of the content columns
if identifier_cols:
identifier_parts = [str(row[col]) for col in identifier_cols if col in row and pd.notna(row[col])]
page_content = " ".join(identifier_parts)
# Add metadata columns as simple key-value pairs
for col in df_source.columns:
if col not in content_cols and col in row and pd.notna(row[col]):

File diff suppressed because one or more lines are too long