-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathred-teamer-loop.py
55 lines (43 loc) · 1.69 KB
/
red-teamer-loop.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import re
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline
# Upper bound on the number of probe/response exchanges in one run.
max_calls = 12
# When True, an empty reply from the target does not end the run; instead the
# next probe sent to the target is the empty string.
constructive_tension = True

# Matches from the first "<|" marker to the end of that line; used to cut the
# trailing markup off the red-team model's generation.
_MARKUP_TAIL = re.compile(r"<\|.*")


def target_model(prompt):
    """Placeholder target that echoes the prompt back.

    Replace with a function taking a prompt and returning the LLM output,
    either as a string or as a sequence whose first element is the reply.
    """
    return prompt


def _first_text(output):
    """Return the first generation in ``output`` as a plain string.

    Accepts a bare string, a sequence of strings, or a sequence of dicts
    carrying a ``"generated_text"`` key (the shape returned by the
    transformers text-generation pipeline). Empty output yields ``""``.
    """
    if isinstance(output, str):
        return output
    if not output:
        return ""
    first = output[0]
    if isinstance(first, dict):
        return first["generated_text"]
    return first


def _generate_challenge(redteamer, last_response):
    """Ask the red-team model for the next probe, given the target's last reply."""
    # Only the first sentence of the previous reply is fed to the red-teamer.
    first_sentence = last_response.split(". ")[0] if last_response else ""
    # Wrap the target's response in the markup used in training.
    query = f"<|input|>{first_sentence}<|response|>"
    generated = _first_text(redteamer(query))
    # Strip the prompt out from the front of the model response.
    if generated.startswith(query):
        generated = generated[len(query):]
    # Unwrap the recommended challenge from the markup.
    return _MARKUP_TAIL.sub("", generated).strip()


def run_red_team_loop(redteamer, target_model, max_calls, constructive_tension):
    """Alternate red-team probes and target replies; return the transcript.

    Args:
        redteamer: Callable taking the marked-up query and returning the
            red-team model's generation (see ``_first_text`` for the
            accepted output shapes).
        target_model: Callable taking a probe string and returning the
            target's reply (same accepted output shapes).
        max_calls: Maximum number of probe/reply exchanges.
        constructive_tension: If True, an empty reply is followed by an
            empty probe rather than ending the run.

    Returns:
        A list of ``("probe", text)`` and ``("model", text)`` tuples in the
        order they occurred.
    """
    turns = []
    last_response = None
    for _ in range(max_calls):
        # Generate a challenge.
        if last_response == "" and constructive_tension:
            challenge = ""
        else:
            challenge = _generate_challenge(redteamer, last_response)
        turns.append(("probe", challenge))

        # Send the challenge and get the response.
        response = _first_text(target_model(challenge)).strip()
        turns.append(("model", response))

        # Compare like with like: last_response is stored with newlines
        # flattened, so flatten the new reply before testing for a repeat.
        flattened = response.replace("\n", " ").strip()
        if not response and not constructive_tension:
            break
        if flattened == last_response:
            break
        last_response = flattened
    return turns


if __name__ == "__main__":
    redteamer = transformers.pipeline(
        "text-generation", "TrustAI-laboratory/Auto-Redteaming-Model"
    )
    turns = run_red_team_loop(
        redteamer, target_model, max_calls, constructive_tension
    )