PrivacyJailbreak/easyjailbreak/attacker/CodeChameleon_2024.py
2025-05-15 14:10:22 +08:00

116 lines
4.5 KiB
Python

"""
CodeChameleon Class
============================================
A novel framework for jailbreaking in LLMs based on
personalized encryption and decryption.
Paper title: CodeChameleon: Personalized Encryption Framework for Jailbreaking Large Language Models
arXiv Link: https://arxiv.org/abs/2402.16717
"""
import json
import logging
logging.basicConfig(level=logging.INFO)
from easyjailbreak.metrics.Evaluator import EvaluatorGenerativeJudge
from easyjailbreak.attacker import AttackerBase
from easyjailbreak.datasets import JailbreakDataset, Instance
from easyjailbreak.mutation.rule import *
from tqdm import tqdm
__all__ = ['CodeChameleon']
class CodeChameleon(AttackerBase):
    r"""
    Implementation of CodeChameleon Jailbreak Challenges in Large Language Models.

    Every query of the dataset is rewritten by each of the configured
    mutations (``BinaryTree``, ``Length``, ``Reverse``, ``OddEven``), the
    resulting prompt is sent to the target model, and each attacked instance
    is written to ``save_path`` as one JSON object per line.
    """

    def __init__(self, attack_model, target_model, eval_model, jailbreak_datasets: JailbreakDataset, save_path, dataset_name):
        r"""
        :param attack_model: The attack_model is used to generate the adversarial prompt. In this case, the attack_model should be set as None.
        :param target_model: The target language model to be attacked.
        :param eval_model: The evaluation model to evaluate the attack results.
        :param jailbreak_datasets: The dataset to be attacked.
        :param save_path: Path of the file the attacked instances are written to (one JSON object per line).
        :param dataset_name: Name of the dataset; when it equals ``'trustllm'`` the system message of each instance is set on the target model before the attack.
        """
        super().__init__(attack_model, target_model, eval_model, jailbreak_datasets)
        # Every mutation rewrites the ``query`` attribute of an instance.
        self.mutations = [
            BinaryTree(attr_name='query'),
            Length(attr_name='query'),
            Reverse(attr_name='query'),
            OddEven(attr_name='query'),
        ]
        self.evaluator = EvaluatorGenerativeJudge(eval_model)
        # Running totals, only changed by ``update``.
        self.current_jailbreak = 0
        self.current_query = 0
        self.current_reject = 0
        self.save_path = save_path
        self.dataset_name = dataset_name

    def single_attack(self, instance: Instance) -> JailbreakDataset:
        r"""
        single attack process using provided prompts and mutation methods.

        :param instance: The Instance that is attacked.
        :return: A JailbreakDataset holding every mutated instance, each with the target model's answer appended to ``target_responses``.
        """
        instance_ds = JailbreakDataset([instance])

        # Phase 1: apply every mutation to the original instance and collect
        # all mutated instances before any model call is made.
        mutated_instances = [
            mutated
            for mutation in self.mutations
            for mutated in mutation(instance_ds)
        ]

        # Phase 2: fill each mutated instance's template and query the target.
        for mutated in mutated_instances:
            prompt = mutated.jailbreak_prompt.format(
                decryption_function=mutated.decryption_function,
                query=mutated.query,
            )
            answer = self.target_model.generate(prompt)
            mutated.target_responses.append(answer)

        return JailbreakDataset(mutated_instances)

    def attack(self):
        r"""
        Execute the attack process using provided prompts and mutations.

        Results are collected in ``self.attack_results`` and written to
        ``self.save_path`` as JSON lines. A ``KeyboardInterrupt`` stops the
        loop early; results produced so far are kept.
        """
        logging.info("Jailbreak started!")
        self.attack_results = JailbreakDataset([])
        try:
            # Explicit UTF-8: the records are dumped with ensure_ascii=False,
            # so a non-UTF-8 platform default encoding could fail on
            # non-ASCII prompts or responses.
            with open(self.save_path, 'w', encoding='utf-8') as f:
                for source_instance in tqdm(self.jailbreak_datasets):
                    if self.dataset_name == 'trustllm':
                        self.target_model.set_system_message(source_instance.system_message)
                    results = self.single_attack(source_instance)
                    for new_instance in results:
                        self.attack_results.add(new_instance)
                        line = new_instance.to_dict()
                        f.write(json.dumps(line, ensure_ascii=False) + '\n')
        except KeyboardInterrupt:
            logging.info("Jailbreak interrupted by user!")
        # NOTE: evaluation and counter updates are currently disabled here, so
        # the counters reported by ``log`` stay unchanged after ``attack``.
        # self.evaluator(self.attack_results)
        # self.update(self.attack_results)
        logging.info("Jailbreak finished!")

    def update(self, Dataset: JailbreakDataset):
        r"""
        Update the state of the Jailbroken based on the evaluation results of Datasets.

        :param Dataset: The Dataset that is attacked.
        """
        for prompt_node in Dataset:
            self.current_jailbreak += prompt_node.num_jailbreak
            self.current_query += prompt_node.num_query
            self.current_reject += prompt_node.num_reject

    def log(self):
        r"""
        Report the attack results.
        """
        logging.info("======Jailbreak report:======")
        logging.info("Total queries: %s", self.current_query)
        logging.info("Total jailbreak: %s", self.current_jailbreak)
        logging.info("Total reject: %s", self.current_reject)
        logging.info("========Report End===========")