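"""Run jailbreak attacks from EasyJailbreak against a target model.

The script loads attack/target/eval models (OpenAI, Anthropic, Gemini API
models, or local HuggingFace checkpoints), runs the selected jailbreak method
over a JailbreakDataset, and writes the results to the given output file.
"""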
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import json
import argparse
import torch
import openai
import logging

from tqdm import tqdm
from typing import Optional

from easyjailbreak.attacker import *
from easyjailbreak.datasets import JailbreakDataset
from easyjailbreak.models.huggingface_model import from_pretrained
from easyjailbreak.models.openai_model import OpenaiModel
from easyjailbreak.models.anthropic_model import AnthropicModel
from easyjailbreak.models.gemini_model import GenaiModel
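
# Map command-line method names to EasyJailbreak attacker classes
# (the PIG-R / PIG-E / PIG-D variants map to the GCA, PIGEON, and PIA attackers).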
jailbreak_method_dict = {
    "AutoDAN": AutoDAN,
    "Cipher": Cipher,
    "CodeChameleon": CodeChameleon,
    "DeepInception": DeepInception,
    "GCG": GCG,
    "GPTFuzzer": GPTFuzzer,
    "Jailbroken": Jailbroken,
    "Multilingual": Multilingual,
    "PAIR": PAIR,
    "ReNeLLM": ReNeLLM,
    "PIG": PIG,
    "PIG-R": GCA,
    "PIG-E": PIGEON,
    "PIG-D": PIA
}


class Hacker(object):
    """Wraps model loading, dataset loading, and attack execution for a single run."""

    def __init__(self, attack_model_path, attack_model_name, target_model_path, target_model_name, eval_model_path,
                 eval_model_name, data_path, dataset_name, output_json_filepath, output_json_filename, max_new_tokens,
                 ds_size: Optional[int] = None, data_file_type: Optional[str] = 'json'):
        self.attack_model_name = attack_model_name
        self.target_model_name = target_model_name
        self.eval_model_name = eval_model_name

        self.max_new_tokens = max_new_tokens

        # Any of these may be None if the corresponding model name is 'None'.
        self.target_model = self.load_model(target_model_path, target_model_name)
        self.attack_model = self.load_model(attack_model_path, attack_model_name)
        self.eval_model = self.load_model(eval_model_path, eval_model_name)

        self.dataset = JailbreakDataset(data_path, local_file_type=data_file_type, ds_size=ds_size)

        self.output_json_filepath = output_json_filepath
        self.output_json_filename = output_json_filename
        self.save_path = self.output_json_filepath + '/' + self.output_json_filename
        self.dataset_name = dataset_name

    def load_model(self, model_path, model_name):
        """Return a model wrapper chosen by model_path: OpenAI, Anthropic, or Gemini
        API clients, or a local HuggingFace checkpoint; None if model_name is 'None'."""
        if model_name == 'None':
            return None
        elif 'openai' in model_path:
            return OpenaiModel(
                model_name=model_name,
                base_url="https://api.ai.cs.ac.cn/v1",
                # base_url="https://openkey.cloud/v1",
                api_keys="sk-1ZZ2ij3jHwwLo8zN0ESQWflomiPAXhWcRgMlRmMPwbZchshy",
                generation_config={
                    'max_tokens': self.max_new_tokens,
                }
            )
        elif 'claude' in model_path:
            return AnthropicModel(
                model_name=model_name,
                base_url="https://api.ai.cs.ac.cn",
                api_keys="sk-1ZZ2ij3jHwwLo8zN0ESQWflomiPAXhWcRgMlRmMPwbZchshy",
                generation_config={
                    'max_tokens': self.max_new_tokens,
                }
            )
        elif 'gemini' in model_path:
            return GenaiModel(
                model_name=model_name,
                api_keys="AIzaSyBTD8Kv9KuUI2F35SvM2GOICoZIafvaze0",
                generation_config={
                    'max_tokens': self.max_new_tokens,
                }
            )
        else:
            return from_pretrained(
                model_name_or_path=model_path,
                model_name=model_name,
                max_new_tokens=self.max_new_tokens,
            )

    def jailbreak(self, jailbreak_method):
        """Run a registered attacker, or query the target model directly with a
        dataset-specific prompt when the method is not in jailbreak_method_dict."""
        if jailbreak_method in jailbreak_method_dict:
            attacker = jailbreak_method_dict[jailbreak_method](
                attack_model=self.attack_model,
                target_model=self.target_model,
                eval_model=self.eval_model,
                jailbreak_datasets=self.dataset,
                save_path=self.save_path,
                dataset_name=self.dataset_name
            )
            attacker.attack()
            # attacker.log()
        else:
            # No jailbreak attacker: build the prompt per dataset and query the target model directly.
            with open(self.save_path, 'w') as f:
                for instance in tqdm(self.dataset):
                    if self.dataset_name == 'trustllm':
                        self.target_model.set_system_message(instance.system_message)

                    if jailbreak_method == 'ICA' and self.dataset_name == 'trustllm':
                        # All target models currently share the same in-context prompt format.
                        instance.jailbreak_prompt = instance.context + f" {instance.query}"
                    elif self.dataset_name == 'enron':
                        instance.jailbreak_prompt = instance.context + instance.query
                    elif self.dataset_name == 'advbench':
                        instance.jailbreak_prompt = instance.query + '\n' + instance.reference_responses[0]
                    elif jailbreak_method == 'BlackBox':
                        pass
                    else:
                        instance.jailbreak_prompt = instance.query
                    # print(instance.jailbreak_prompt)
                    answer = self.target_model.generate(instance.jailbreak_prompt)
                    instance.target_responses.append(answer)

                    line = instance.to_dict()
                    f.write(json.dumps(line, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, help='the path of the data file')
    parser.add_argument('--dataset_name', type=str, help='the name of the dataset')
    parser.add_argument('--dataset_version', type=str, help='the version of the dataset')
    parser.add_argument('--attack_model_path', type=str, help='the path of the attack model')
    parser.add_argument('--attack_model_name', type=str, help='the name of the attack model')
    parser.add_argument('--target_model_path', type=str, help='the path of the target model')
    parser.add_argument('--target_model_name', type=str, help='the name of the target model')
    parser.add_argument('--eval_model_path', type=str, help='the path of the eval model')
    parser.add_argument('--eval_model_name', type=str, help='the name of the eval model')
    parser.add_argument('--jailbreak_method', type=str, help='how to jailbreak the target model')
    parser.add_argument('--output_json_filepath', type=str, help='filepath of the output json file')
    parser.add_argument('--output_json_filename', type=str, help='filename of the output json file')
    args = parser.parse_args()

    print(args)

    hacker = Hacker(
        max_new_tokens=128,
        data_path=args.data_path,
        dataset_name=args.dataset_name,
        attack_model_name=args.attack_model_name,
        attack_model_path=args.attack_model_path,
        target_model_name=args.target_model_name,
        target_model_path=args.target_model_path,
        eval_model_name=args.eval_model_name,
        eval_model_path=args.eval_model_path,
        output_json_filepath=args.output_json_filepath,
        output_json_filename=args.output_json_filename,
    )
    hacker.jailbreak(args.jailbreak_method)
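
# Example invocation (script name, paths, and model names below are illustrative,
# not taken from the repository):
#   python attack.py \
#       --data_path data/advbench.json --dataset_name advbench \
#       --attack_model_path openai --attack_model_name gpt-4 \
#       --target_model_path meta-llama/Llama-2-7b-chat-hf --target_model_name llama-2 \
#       --eval_model_path openai --eval_model_name gpt-4 \
#       --jailbreak_method PAIR \
#       --output_json_filepath results --output_json_filename pair_advbench.jsonl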