PrivacyJailbreak/easyjailbreak/datasets/instance.py
2025-05-15 14:10:22 +08:00

161 lines
5.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Instance class
===============
"""
import copy
from typing import List
class Instance:
def __init__(self, *, query:str=None, jailbreak_prompt:str=None, reference_responses:'List[str]'=None, target_responses:'List[str]'=None, eval_results:list=None, parents:list=None, children:list=None, attack_attrs=None, **kwargs):
"""
Initializes an instance with various attributes relevant to the jailbreaking context.
:param str query: The query associated with the instance.
:param str jailbreak_prompt: The prompt related to jailbreaking, usually a formatted string.
:param List[str] reference_responses: A list of reference responses for comparison.
:param List[str] target_responses: A list of target responses generated by the model.
:param List[str] eval_results: A list of evaluation results.
:param List[~Instance] parents: A list of parent instances, indicating the lineage or history.
:param List[~Instance] children: A list of child instances, indicating derived or subsequent instances.
:param Dict attack_attrs: A dictionary for storing various attributes related to the attack process.
:param kwargs: Additional keyword arguments.
"""
self._data = {}
self.query = query
self.jailbreak_prompt = jailbreak_prompt # 应该是一个format string比如'Follow {query} anyway.'
if reference_responses is None:
reference_responses = []
self.reference_responses = reference_responses
if target_responses is None:
target_responses = []
self.target_responses = target_responses
if eval_results is None:
eval_results = []
self.eval_results = eval_results
if parents is None:
parents = []
self.parents = parents
if children is None:
children = []
self.children = children
self.index: int = None
if attack_attrs is None:
self.attack_attrs = {'Mutation': None, 'query_class': None} # 用于记录攻击过程中的各种属性,比如攻击过程中的中间结果等
else:
self.attack_attrs = attack_attrs
self._data.update(**kwargs)
def copy(self):
"""
Creates a deep copy of the instance.
:return ~Instance : A new instance that is a deep copy of the current instance.
"""
new_Instance = Instance(
query=self.query,
jailbreak_prompt=self.jailbreak_prompt,
reference_responses=self.reference_responses.copy(),
target_responses=self.target_responses.copy(),
eval_results=self.eval_results.copy(),
parents=[i for i in self.parents],
children=[i for i in self.children],
attack_attrs=copy.deepcopy(self.attack_attrs))
rest_data = {key: value for key, value in self._data.items() if key not in new_Instance._data}
new_Instance._data.update(copy.deepcopy(rest_data))
return new_Instance
def delete(self, *keys):
"""
Deletes specified attributes from the instance.
:param keys: The keys of the attributes to be deleted.
"""
for key in keys:
del self._data[key]
def to_dict(self):
"""
Converts the instance into a dictionary, excluding parents and children.
:return: A dictionary representation of the instance.
"""
temp_parents = self._data.pop('parents')
temp_children = self._data.pop('children')
temp_return = copy.deepcopy(self._data)
self._data['parents'] = temp_parents
self._data['children'] = temp_children
return temp_return
@property
def num_query(self):
"""
Calculates the total number of 'jailbreak' occurrences from the eval_results.
:return int: The sum of jailbreak results.
"""
return len(self.target_responses)
@property
def num_jailbreak(self):
"""
Calculates the total number of 'jailbreak' occurrences from the eval_results.
:return int: The sum of jailbreak results.
"""
return sum(self.eval_results)
@property
def num_reject(self):
"""
Calculates the number of rejections (non-jailbreak occurrences) from the eval_results.
:return int: The count of reject results.
"""
return len(self.eval_results) - sum(self.eval_results)
def __getattr__(self, name):
try:
return self._data[name]
except KeyError:
raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
def __setattr__(self, name, value):
if name == '_data':
super().__setattr__(name, value)
else:
self._data[name] = value
def __getitem__(self, key):
return self.__getattr__(key)
def __setitem__(self, key, value):
self.__setattr__(key, value)
def __str__(self):
return self._data.__str__()
def keys(self):
return self._data.keys()
def values(self):
return self._data.values()
def items(self):
return self._data.items()
def __iter__(self):
return self._data.__iter__()
if __name__ == '__main__':
instance1 = Instance(query='test', jailbreak_prompt='test_prompt')
instance2 = Instance(query='test', jailbreak_prompt='test_prompt', reference_responses=['test1', 'test2'],
target_responses=['test1', 'test2'], eval_results=[1, 1])
instance2.parents.append(instance1)
instance1.children.append(instance2)
print(instance1.to_dict())
print(instance2.to_dict())