161 lines
5.6 KiB
Python
161 lines
5.6 KiB
Python
"""
Instance class
===============
"""
|
||
import copy
|
||
from typing import List
|
||
|
||
class Instance:
    """
    Attribute-style container for one jailbreak sample.

    Every attribute except ``_data`` itself is stored inside the ``_data``
    dictionary: ``__setattr__`` writes into it and ``__getattr__`` reads from
    it. The same entries are also reachable with item syntax
    (``instance['query']``) and through ``keys()``, ``values()``, ``items()``
    and iteration, which all delegate to ``_data``.
    """

    def __init__(
        self,
        *,
        query: str = None,
        jailbreak_prompt: str = None,
        reference_responses: 'List[str]' = None,
        target_responses: 'List[str]' = None,
        eval_results: list = None,
        parents: list = None,
        children: list = None,
        attack_attrs=None,
        **kwargs
    ):
        """
        Initializes an instance with various attributes relevant to the jailbreaking context.

        List-valued arguments left as ``None`` are replaced by a fresh empty
        list; lists that are passed in are stored as-is (not copied).

        :param str query: The query associated with the instance.
        :param str jailbreak_prompt: The prompt related to jailbreaking, usually a formatted string.
        :param List[str] reference_responses: A list of reference responses for comparison.
        :param List[str] target_responses: A list of target responses generated by the model.
        :param List[str] eval_results: A list of evaluation results.
        :param List[~Instance] parents: A list of parent instances, indicating the lineage or history.
        :param List[~Instance] children: A list of child instances, indicating derived or subsequent instances.
        :param Dict attack_attrs: A dictionary for storing various attributes related to the attack process.
            Defaults to ``{'Mutation': None, 'query_class': None}``.
        :param kwargs: Additional keyword arguments, stored as extra entries. They are
            applied last, so they override any entry set above with the same name.
        """
        # Must be assigned first: every other attribute write lands in this dict.
        self._data = {}
        self.query = query
        # Should be a format string, e.g. 'Follow {query} anyway.'
        self.jailbreak_prompt = jailbreak_prompt
        self.reference_responses = [] if reference_responses is None else reference_responses
        self.target_responses = [] if target_responses is None else target_responses
        self.eval_results = [] if eval_results is None else eval_results
        self.parents = [] if parents is None else parents
        self.children = [] if children is None else children
        # Starts as None; nothing in this class assigns it afterwards.
        self.index = None

        # Records various attributes produced during the attack,
        # e.g. intermediate results.
        if attack_attrs is None:
            self.attack_attrs = {'Mutation': None, 'query_class': None}
        else:
            self.attack_attrs = attack_attrs
        self._data.update(**kwargs)

    def copy(self):
        """
        Creates a new, independent instance from the current one.

        The copy is not uniformly deep:

        * ``reference_responses``, ``target_responses`` and ``eval_results``
          are shallow-copied via their own ``copy()`` method;
        * ``parents`` and ``children`` become new lists that reference the
          same parent/child objects;
        * ``attack_attrs`` and any extra entries (those not set by the
          constructor) are deep-copied;
        * ``index`` is not carried over; the copy has ``index`` set to ``None``.

        :return ~Instance : The newly created instance.
        """
        new_Instance = Instance(
            query=self.query,
            jailbreak_prompt=self.jailbreak_prompt,
            reference_responses=self.reference_responses.copy(),
            target_responses=self.target_responses.copy(),
            eval_results=self.eval_results.copy(),
            parents=list(self.parents),
            children=list(self.children),
            attack_attrs=copy.deepcopy(self.attack_attrs))
        # Only entries the constructor did not already populate are carried over.
        rest_data = {
            key: value
            for key, value in self._data.items()
            if key not in new_Instance._data
        }
        new_Instance._data.update(copy.deepcopy(rest_data))
        return new_Instance

    def delete(self, *keys):
        """
        Deletes specified attributes from the instance.

        :param keys: The keys of the attributes to be deleted.
        :raises KeyError: If a key is not currently stored.
        """
        for key in keys:
            del self._data[key]

    def to_dict(self):
        """
        Converts the instance into a dictionary, excluding parents and children.

        The stored data is only read, never modified, so the entry order of
        the instance is preserved and a failure while copying cannot drop
        ``parents`` or ``children``. Missing ``parents``/``children`` entries
        are tolerated.

        :return: A deep-copied dictionary representation of the instance.
        """
        excluded = ('parents', 'children')
        return copy.deepcopy(
            {key: value for key, value in self._data.items() if key not in excluded}
        )

    @property
    def num_query(self):
        """
        Counts the entries in ``target_responses``.

        :return int: The length of ``target_responses``.
        """
        return len(self.target_responses)

    @property
    def num_jailbreak(self):
        """
        Calculates the total number of 'jailbreak' occurrences from the eval_results.

        :return int: The sum of the entries in ``eval_results``.
        """
        return sum(self.eval_results)

    @property
    def num_reject(self):
        """
        Calculates the number of rejections (non-jailbreak occurrences) from the eval_results.

        :return int: ``len(eval_results)`` minus the sum of ``eval_results``.
        """
        return len(self.eval_results) - sum(self.eval_results)

    def __getattr__(self, name):
        """
        Resolves attributes that normal lookup did not find from ``_data``.

        :raises AttributeError: If ``name`` is not stored, or if ``_data``
            itself has not been created yet.
        """
        # Objects created without __init__ (copy.deepcopy / pickle rebuild them
        # through __new__) have no _data yet. Without this guard, reading
        # self._data below would re-enter __getattr__ until RecursionError.
        if name == '_data':
            raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
        try:
            return self._data[name]
        except KeyError:
            raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") from None

    def __setattr__(self, name, value):
        """Stores ``value`` in ``_data``; only ``_data`` itself is a real attribute."""
        if name == '_data':
            super().__setattr__(name, value)
        else:
            self._data[name] = value

    def __getitem__(self, key):
        """
        Item-style read of a stored entry.

        Raises ``AttributeError`` (not ``KeyError``) for a missing key, because
        it delegates to ``__getattr__``.
        """
        return self.__getattr__(key)

    def __setitem__(self, key, value):
        """Item-style write; equivalent to setting the attribute ``key``."""
        self.__setattr__(key, value)

    def __str__(self):
        """Returns the string form of the underlying ``_data`` dictionary."""
        return self._data.__str__()

    def keys(self):
        """Returns the keys view of the stored entries."""
        return self._data.keys()

    def values(self):
        """Returns the values view of the stored entries."""
        return self._data.values()

    def items(self):
        """Returns the items view of the stored entries."""
        return self._data.items()

    def __iter__(self):
        """Iterates over the names of the stored entries."""
        return self._data.__iter__()
|
||
|
||
|
||
if __name__ == '__main__':
    # Small demo: link a parent and a child instance to each other, then
    # print the dictionary form of each.
    parent = Instance(query='test', jailbreak_prompt='test_prompt')
    child = Instance(
        query='test',
        jailbreak_prompt='test_prompt',
        reference_responses=['test1', 'test2'],
        target_responses=['test1', 'test2'],
        eval_results=[1, 1],
    )
    child.parents.append(parent)
    parent.children.append(child)

    for node in (parent, child):
        print(node.to_dict())
|