# 03-save-model-weights.py
from tqdm import tqdm
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Qwen/Qwen2-7B-Instruct"
SKIP_BEGIN_LAYERS = 1   # Number of initial layers to leave unmodified.
SKIP_END_LAYERS = 0     # Number of final layers to leave unmodified.
SCALE_FACTOR = 1.0      # 1.0 fully ablates the direction; < 1.0 ablates less; negative induces it instead.

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Reload the model in CPU memory with bfloat16 data type
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
trust_remote_code=True,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
device_map='cpu'
)
model.requires_grad_(False)
# Load your pre-computed direction tensor
refusal_dir = torch.load(MODEL_ID.replace("/", "_") + "_refusal_dir.pt")
refusal_dir = refusal_dir.to(model.device)
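
# NOTE (assumption): torch.outer(r, r) below is only a true projection matrix when
# ||r|| == 1, so refusal_dir is expected to be unit-norm (presumably normalised by
# the earlier script in this series). Re-normalising here is a cheap safeguard and
# a no-op for an already-unit vector:
refusal_dir = refusal_dir / refusal_dir.norm()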
# Get the language model component and check it's as expected.
lm_model = model.model
assert hasattr(lm_model, 'layers'), "The model does not have the expected structure."
# Check the ranges are valid.
num_layers = len(lm_model.layers)
assert SKIP_BEGIN_LAYERS >= 0, "SKIP_BEGIN_LAYERS must be >= 0."
assert SKIP_END_LAYERS >= 0, "SKIP_END_LAYERS must be >= 0."
assert SKIP_BEGIN_LAYERS + SKIP_END_LAYERS < num_layers, "SKIP_BEGIN_LAYERS + SKIP_END_LAYERS must be < num_layers."
bar_layers = tqdm(total=(num_layers - (SKIP_BEGIN_LAYERS + SKIP_END_LAYERS)) * 2, desc="Modifying tensors")

# Cast any ops performed on the CPU up to float32 (a newer CPU may be able to use bfloat16 here).
# NOTE: Use a negative scale_factor to "induce" the refusal direction, or a positive scale_factor < 1 to "ablate" it less.
def modify_tensor(tensor_data, refusal_dir, scale_factor: float = 1.0):
    assert scale_factor <= 1.0, "Using a scale_factor of > 1 doesn't make sense..."
    tensor_float32 = tensor_data.to(torch.float32)
    refusal_dir_float32 = refusal_dir.to(torch.float32)
    # Ensure refusal_dir is a 1-dimensional tensor.
    if refusal_dir_float32.dim() > 1:
        refusal_dir_float32 = refusal_dir_float32.view(-1)
    # W -= s * (r r^T) W : remove (scale_factor of) the component of each column
    # of the weight matrix that lies along the refusal direction r.
    tensor_float32 -= scale_factor * torch.matmul(torch.outer(refusal_dir_float32, refusal_dir_float32), tensor_float32)
    tensor_modified = tensor_float32.to(torch.bfloat16)
    bar_layers.update(1)
    return torch.nn.Parameter(tensor_modified)
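
# A toy illustration of the ablation above (scale_factor == 1.0 case): with
# r = [1, 0] and W = [[1, 2], [3, 4]], torch.outer(r, r) @ W == [[1, 2], [0, 0]],
# so W - torch.outer(r, r) @ W == [[0, 0], [3, 4]]: the component of every
# column of W along r has been removed, and everything orthogonal to r is untouched.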
# Modify 'self_attn.o_proj.weight' and 'mlp.down_proj.weight' in each chosen layer.
# NOTE: These tensor names are specific to "Qwen2" and may need changing for other architectures.
# - See here for others: https://github.com/arcee-ai/mergekit/tree/main/mergekit/_data/architectures
for layer_idx in range(SKIP_BEGIN_LAYERS, num_layers - SKIP_END_LAYERS):
    lm_model.layers[layer_idx].self_attn.o_proj.weight = modify_tensor(
        lm_model.layers[layer_idx].self_attn.o_proj.weight.data, refusal_dir, SCALE_FACTOR
    )
    lm_model.layers[layer_idx].mlp.down_proj.weight = modify_tensor(
        lm_model.layers[layer_idx].mlp.down_proj.weight.data, refusal_dir, SCALE_FACTOR
    )
bar_layers.close()
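
# Optional sanity check (a minimal sketch): with SCALE_FACTOR == 1.0 and a unit-norm
# refusal_dir, each modified matrix should now have a (near-)zero component along
# the refusal direction, up to bfloat16 rounding.
with torch.no_grad():
    w = lm_model.layers[SKIP_BEGIN_LAYERS].self_attn.o_proj.weight.to(torch.float32)
    r = refusal_dir.to(torch.float32).view(-1)
    print("Max residual along refusal_dir:", (r @ w).abs().max().item())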
# Save the modified model and original tokenizer
print("Saving modified model (with original tokenizer)...")
model.save_pretrained("modified_model")
tokenizer.save_pretrained("modified_model")
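
# Quick smoke test of the modified in-memory model (the prompt is just a
# placeholder; generation on CPU will be slow for a 7B model):
messages = [{"role": "user", "content": "Hello! Who are you?"}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))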

'''
Dev Notes
---
Code adapted from: https://github.com/Sumandora/remove-refusals-with-transformers/issues/1

Model Reference:
  https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2/modeling_qwen2.py
  https://huggingface.co/augmxnt/Qwen2-7B-Instruct-deccp/blob/main/model.safetensors.index.json
  https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py

Future reference:
  https://chatgpt.com/c/f176e037-9638-4c33-b8f2-597aab09bddd
  https://chatgpt.com/c/7b3355ad-9a4d-4e44-86d7-4ff1e1f1eeb3
  https://chatgpt.com/c/25da5e7c-80c0-4b3a-8c46-f113e7dd509a
  https://chatgpt.com/c/4edcc052-3f26-4aa5-a5f4-33902a2d2849
  https://claude.ai/chat/f5c84631-466b-40a2-bd5b-d709b24709ce
  https://claude.ai/chat/013885ce-b87b-4fca-8d88-b3c1e1091cee
  https://claude.ai/chat/cf64e2ea-6da5-4900-bd9b-4cb0300e26ee
  https://claude.ai/chat/8643e495-3ac3-403c-b6af-e836c057ff9e
  https://claude.ai/chat/9c9d264b-9a93-440b-bdd6-9d067db91b1f
  https://chat.mistral.ai/chat/15919370-83af-4562-bf84-c6a67ea7597d
  https://chat.mistral.ai/chat/73a78a2e-0c4d-4a9d-8ac4-b0a2c537a6ee

FINALLY WORKED: https://chat.mistral.ai/chat/d3731f9c-4952-4be7-9704-e7ddab86df66
'''