# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import json
import argparse
import torch
import openai
import logging
from tqdm import tqdm
from typing import Optional

from easyjailbreak.attacker import *
from easyjailbreak.datasets import JailbreakDataset
from easyjailbreak.models.huggingface_model import from_pretrained
from easyjailbreak.models.openai_model import OpenaiModel
from easyjailbreak.models.anthropic_model import AnthropicModel
from easyjailbreak.models.gemini_model import GenaiModel

# Mapping from the CLI method name to the attacker class that implements it.
jailbreak_method_dict = {
    "AutoDAN": AutoDAN,
    "Cipher": Cipher,
    "CodeChameleon": CodeChameleon,
    "DeepInception": DeepInception,
    "GCG": GCG,
    "GPTFuzzer": GPTFuzzer,
    "Jailbroken": Jailbroken,
    "Multilingual": Multilingual,
    "PAIR": PAIR,
    "ReNeLLM": ReNeLLM,
    "PIG": PIG,
    "PIG-R": GCA,
    "PIG-E": PIGEON,
    "PIG-D": PIA,
}


class Hacker(object):
    def __init__(self,
                 attack_model_path, attack_model_name,
                 target_model_path, target_model_name,
                 eval_model_path, eval_model_name,
                 data_path, dataset_name,
                 output_json_filepath, output_json_filename,
                 max_new_tokens,
                 ds_size: Optional[int] = None,
                 data_file_type: Optional[str] = 'json'):
        self.attack_model_name = attack_model_name
        self.target_model_name = target_model_name
        self.eval_model_name = eval_model_name
        self.max_new_tokens = max_new_tokens
        self.target_model = self.load_model(target_model_path, target_model_name)
        self.attack_model = self.load_model(attack_model_path, attack_model_name)
        self.eval_model = self.load_model(eval_model_path, eval_model_name)
        self.dataset = JailbreakDataset(data_path, local_file_type=data_file_type, ds_size=ds_size)
        self.output_json_filepath = output_json_filepath
        self.output_json_filename = output_json_filename
        self.save_path = self.output_json_filepath + '/' + self.output_json_filename
        self.dataset_name = dataset_name

    def load_model(self, model_path, model_name):
        """Load an API-backed model or a local HuggingFace model depending on the path."""
        if model_name == 'None':
            return None
        elif 'openai' in model_path:
            return OpenaiModel(
                model_name=model_name,
                base_url="https://api.ai.cs.ac.cn/v1",
                # base_url="https://openkey.cloud/v1",
                api_keys="sk-1ZZ2ij3jHwwLo8zN0ESQWflomiPAXhWcRgMlRmMPwbZchshy",
                generation_config={
                    'max_tokens': self.max_new_tokens,
                }
            )
        elif 'claude' in model_path:
            return AnthropicModel(
                model_name=model_name,
                base_url="https://api.ai.cs.ac.cn",
                api_keys="sk-1ZZ2ij3jHwwLo8zN0ESQWflomiPAXhWcRgMlRmMPwbZchshy",
                generation_config={
                    'max_tokens': self.max_new_tokens,
                }
            )
        elif 'gemini' in model_path:
            return GenaiModel(
                model_name=model_name,
                api_keys="AIzaSyBTD8Kv9KuUI2F35SvM2GOICoZIafvaze0",
                generation_config={
                    'max_tokens': self.max_new_tokens,
                }
            )
        else:
            return from_pretrained(
                model_name_or_path=model_path,
                model_name=model_name,
                max_new_tokens=self.max_new_tokens,
            )

    def jailbreak(self, jailbreak_method):
        if jailbreak_method in jailbreak_method_dict.keys():
            # Registered method: delegate to the corresponding EasyJailbreak attacker.
            attacker = jailbreak_method_dict[jailbreak_method](
                attack_model=self.attack_model,
                target_model=self.target_model,
                eval_model=self.eval_model,
                jailbreak_datasets=self.dataset,
                save_path=self.save_path,
                dataset_name=self.dataset_name
            )
            attacker.attack()
            # attacker.log()
        else:
            # No registered attacker: build the prompt per dataset, query the target
            # model directly, and dump one JSON line per instance.
            with open(self.save_path, 'w') as f:
                for instance in tqdm(self.dataset):
                    if self.dataset_name == 'trustllm':
                        self.target_model.set_system_message(instance.system_message)
                    if jailbreak_method == 'ICA' and self.dataset_name == 'trustllm':
                        # The same format is used for llama-2, llama-3, and all other targets.
                        instance.jailbreak_prompt = instance.context + f" {instance.query}"
                    elif self.dataset_name == 'enron':
                        instance.jailbreak_prompt = instance.context + instance.query
                    elif self.dataset_name == 'advbench':
                        instance.jailbreak_prompt = instance.query + '\n' + instance.reference_responses[0]
                    elif jailbreak_method == 'BlackBox':
                        pass
                    else:
                        instance.jailbreak_prompt = instance.query
                    # print(instance.jailbreak_prompt)
                    answer = self.target_model.generate(instance.jailbreak_prompt)
                    instance.target_responses.append(answer)
                    line = instance.to_dict()
                    f.write(json.dumps(line, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, help='the path of the data file')
    parser.add_argument('--dataset_name', type=str, help='the name of the dataset')
    parser.add_argument('--dataset_version', type=str, help='the version of the dataset')
    parser.add_argument('--attack_model_path', type=str, help='the path of the attack model')
    parser.add_argument('--attack_model_name', type=str, help='the name of the attack model')
    parser.add_argument('--target_model_path', type=str, help='the path of the target model')
    parser.add_argument('--target_model_name', type=str, help='the name of the target model')
    parser.add_argument('--eval_model_path', type=str, help='the path of the eval model')
    parser.add_argument('--eval_model_name', type=str, help='the name of the eval model')
    parser.add_argument('--jailbreak_method', type=str, help='how to jailbreak the target model')
    parser.add_argument('--output_json_filepath', type=str, help='filepath of the output json file')
    parser.add_argument('--output_json_filename', type=str, help='filename of the output json file')
    args = parser.parse_args()
    print(args)

    hacker = Hacker(
        max_new_tokens=128,
        data_path=args.data_path,
        dataset_name=args.dataset_name,
        attack_model_name=args.attack_model_name,
        attack_model_path=args.attack_model_path,
        target_model_name=args.target_model_name,
        target_model_path=args.target_model_path,
        eval_model_name=args.eval_model_name,
        eval_model_path=args.eval_model_path,
        output_json_filepath=args.output_json_filepath,
        output_json_filename=args.output_json_filename,
    )
    hacker.jailbreak(args.jailbreak_method)
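# Example invocation (a sketch only; the script filename, file paths, and model
# locations below are placeholders, not values shipped with this script, and must
# be adapted to the local setup). Flag names match the argparse definitions above.
#
#   python attack.py \
#       --data_path ./data/advbench.jsonl \
#       --dataset_name advbench \
#       --dataset_version v1 \
#       --attack_model_path None --attack_model_name None \
#       --target_model_path /path/to/llama-2-7b-chat --target_model_name llama-2 \
#       --eval_model_path openai --eval_model_name gpt-4 \
#       --jailbreak_method Jailbroken \
#       --output_json_filepath ./results \
#       --output_json_filename advbench_jailbroken.jsonl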