feat: Add JSON Cleaner Component (#2584)

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
This commit is contained in:
Hamish Arnold 2024-07-12 17:26:14 +01:00 committed by GitHub
commit c522a7aae7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 114 additions and 6 deletions

23
poetry.lock generated
View file

@ -2616,8 +2616,8 @@ files = [
[package.dependencies]
cffi = {version = ">=1.12.2", markers = "platform_python_implementation == \"CPython\" and sys_platform == \"win32\""}
greenlet = [
{version = ">=2.0.0", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.11\""},
{version = ">=3.0rc3", markers = "platform_python_implementation == \"CPython\" and python_version >= \"3.11\""},
{version = ">=2.0.0", markers = "platform_python_implementation == \"CPython\" and python_version < \"3.11\""},
]
"zope.event" = "*"
"zope.interface" = "*"
@ -2776,12 +2776,12 @@ files = [
google-auth = ">=2.14.1,<3.0.dev0"
googleapis-common-protos = ">=1.56.2,<2.0.dev0"
grpcio = [
{version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
{version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
{version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
]
grpcio-status = [
{version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
{version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""},
{version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""},
]
proto-plus = ">=1.22.3,<2.0.0dev"
protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"
@ -4241,6 +4241,17 @@ files = [
{file = "jq-1.7.0.tar.gz", hash = "sha256:f460d1f2c3791617e4fb339fa24efbdbebe672b02c861f057358553642047040"},
]
[[package]]
name = "json-repair"
version = "0.25.2"
description = "A package to repair broken json strings"
optional = false
python-versions = ">=3.7"
files = [
{file = "json_repair-0.25.2-py3-none-any.whl", hash = "sha256:51d67295c3184b6c41a3572689661c6128cef6cfc9fb04db63130709adfc5bf0"},
{file = "json_repair-0.25.2.tar.gz", hash = "sha256:161a56d7e6bbfd4cad3a614087e3e0dbd0e10d402dd20dc7db418432428cb32b"},
]
[[package]]
name = "jsonpatch"
version = "1.33"
@ -4979,8 +4990,8 @@ psutil = ">=5.9.1"
pywin32 = {version = "*", markers = "platform_system == \"Windows\""}
pyzmq = ">=25.0.0"
requests = [
{version = ">=2.26.0", markers = "python_version <= \"3.11\""},
{version = ">=2.32.2", markers = "python_version > \"3.11\""},
{version = ">=2.26.0", markers = "python_version <= \"3.11\""},
]
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
typing-extensions = {version = ">=4.6.0", markers = "python_version < \"3.11\""}
@ -6518,9 +6529,9 @@ files = [
[package.dependencies]
numpy = [
{version = ">=1.26.0,<2", markers = "python_version >= \"3.12\""},
{version = ">=1.22.4,<2", markers = "python_version < \"3.11\""},
{version = ">=1.23.2,<2", markers = "python_version == \"3.11\""},
{version = ">=1.26.0,<2", markers = "python_version >= \"3.12\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@ -11315,4 +11326,4 @@ local = ["ctransformers", "llama-cpp-python", "sentence-transformers"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.10,<3.13"
content-hash = "0dcc235615bcef9db6b669b27a2588f58121bf6233545a469b08206cb5a6e63b"
content-hash = "e4d237d2f128824e707fb5e2c0ba645495fc0947ba868099aac39a3e4424ac5d"

View file

@ -94,10 +94,14 @@ langchain-aws = "^0.1.6"
langchain-mongodb = "^0.1.6"
kubernetes = "^30.1.0"
firecrawl-py = "^0.0.16"
json-repair = "^0.25.2"
langchain-nvidia-ai-endpoints = "^0.1.2"
langchain-google-calendar-tools = "^0.0.1"
[tool.poetry.group.dev.dependencies]
types-redis = "^4.6.0.5"
ipykernel = "^6.29.0"

View file

@ -0,0 +1,93 @@
import json
import re
import unicodedata
from langflow.custom import Component
from langflow.inputs import MessageTextInput, BoolInput
from langflow.template import Output
from langflow.schema.message import Message
class JSONCleaner(Component):
    """Extract a JSON object from a text input and run it through ``json_repair``.

    The component slices the input down to the span between the first ``{`` and
    the last ``}``, applies the optional clean-up steps selected through the
    boolean inputs, and returns the repaired text wrapped in a ``Message``.
    """

    display_name = "JSON Cleaner"
    description = "Cleans the messy and sometimes incorrect JSON strings produced by LLMs so that they are fully compliant with the JSON spec."
    icon = "custom_components"

    inputs = [
        MessageTextInput(
            name="json_str", display_name="JSON String", info="The JSON string to be cleaned.", required=True
        ),
        BoolInput(
            name="remove_control_chars",
            display_name="Remove Control Characters",
            info="Remove control characters from the JSON string.",
            required=False,
        ),
        BoolInput(
            name="normalize_unicode",
            display_name="Normalize Unicode",
            info="Normalize Unicode characters in the JSON string.",
            required=False,
        ),
        BoolInput(
            name="validate_json",
            display_name="Validate JSON",
            info="Validate the JSON string to ensure it is well-formed.",
            required=False,
        ),
    ]

    outputs = [
        Output(display_name="Cleaned JSON String", name="output", method="clean_json"),
    ]

    def clean_json(self) -> Message:
        """Clean the input JSON string based on provided options and return the cleaned JSON string.

        Returns:
            A ``Message`` whose text is the stringified result of ``repair_json``.
            The same text is also stored on ``self.status``.

        Raises:
            ImportError: If the ``json_repair`` package cannot be imported.
            ValueError: If no ``{ ... }`` span can be found in the input, or if
                any cleaning step fails. The original exception is chained as
                the cause.
        """
        # Imported lazily so that a missing optional package only fails when
        # this method actually runs.
        try:
            from json_repair import repair_json  # type: ignore
        except ImportError as e:
            raise ImportError(
                "Could not import the json_repair package. Please install it with `pip install json_repair`."
            ) from e

        json_str = self.json_str
        remove_control_chars = self.remove_control_chars
        normalize_unicode = self.normalize_unicode
        validate_json = self.validate_json

        try:
            # Keep only the outermost brace-delimited span; any text around it
            # is discarded.
            start = json_str.find("{")
            end = json_str.rfind("}")
            # `end < start` also covers `end == -1` whenever a "{" exists, and
            # rejects inputs such as "} {" where the slice would be empty.
            if start == -1 or end < start:
                raise ValueError("Invalid JSON string: Missing '{' or '}'")
            json_str = json_str[start : end + 1]

            if remove_control_chars:
                json_str = self._remove_control_characters(json_str)
            if normalize_unicode:
                json_str = self._normalize_unicode(json_str)
            if validate_json:
                # Runs before the repair step, so an input that does not
                # already parse is rejected here instead of being repaired.
                json_str = self._validate_json(json_str)

            cleaned_json_str = repair_json(json_str)
            result = str(cleaned_json_str)

            self.status = result
            return Message(text=result)
        except Exception as e:
            # Single error type for callers; the root cause stays attached.
            raise ValueError(f"Error cleaning JSON string: {str(e)}") from e

    def _remove_control_characters(self, s: str) -> str:
        """Remove control characters from the string.

        Strips every character in U+0000-U+001F plus U+007F, which includes
        tab, line feed and carriage return.
        """
        return re.sub(r"[\x00-\x1F\x7F]", "", s)

    def _normalize_unicode(self, s: str) -> str:
        """Normalize Unicode characters in the string to NFC form."""
        return unicodedata.normalize("NFC", s)

    def _validate_json(self, s: str) -> str:
        """Validate the JSON string.

        Returns:
            ``s`` unchanged when ``json.loads`` accepts it.

        Raises:
            ValueError: If ``json.loads`` raises ``JSONDecodeError``; the
                decode error is chained as the cause.
        """
        try:
            json.loads(s)
            return s
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON string: {str(e)}") from e