dump_sims.py
"""Dump token-level similarity measures between a MarianMT decomposition and
the total embeddings, one CSV row per (token, layer, similarity function)."""
import csv
import pathlib
import torch
import tqdm
import transformers as hf
from extract_marianmt import Decomposer


def spim(decomposition, total_embs=None):
    """Fraction of the total embedding attributable to each term: the dot
    product with the total, normalized by the total's squared norm. These
    fractions sum to 1 across terms, which the assert verifies."""
    if total_embs is None:
        total_embs = decomposition.sum(-2)
    norms = torch.linalg.norm(total_embs, dim=-1, keepdim=True) ** 2
    scalar_prods = torch.einsum('lstd,lsd->lst', decomposition, total_embs)
    sims = scalar_prods / norms
    assert torch.allclose(sims.sum(-1), torch.ones_like(sims.sum(-1)))
    return sims


def l2(decomposition, total_embs=None):
    """Euclidean distance between each term and the total embedding."""
    if total_embs is None:
        total_embs = decomposition.sum(-2)
    total_embs = total_embs.unsqueeze(-2)  # broadcast against the term axis
    return torch.linalg.norm(decomposition - total_embs, dim=-1)


def cosine(decomposition, total_embs=None):
    """Cosine similarity between each term and the total embedding."""
    if total_embs is None:
        total_embs = decomposition.sum(-2)
    tgt_norms = torch.linalg.norm(total_embs, dim=-1, keepdim=True)
    dcp_norms = torch.linalg.norm(decomposition, dim=-1)
    scalar_prods = torch.einsum('lstd,lsd->lst', decomposition, total_embs)
    norms = tgt_norms * dcp_norms
    return scalar_prods / norms


def norm_ratio(decomposition, total_embs=None):
    """Ratio of each term's norm to the norm of the total embedding."""
    if total_embs is None:
        total_embs = decomposition.sum(-2)
    tgt_norms = torch.linalg.norm(total_embs, dim=-1, keepdim=True)
    dcp_norms = torch.linalg.norm(decomposition, dim=-1)
    return dcp_norms / tgt_norms


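# A minimal sanity check for the measures above (hypothetical shapes: 2
# layers, 3 tokens, 5 decomposition terms, 8 hidden dims), assuming the
# (layer, seq, term, dim) layout implied by the einsum strings:
#
#     dcp = torch.randn(2, 3, 5, 8, dtype=torch.float64)
#     assert spim(dcp).shape == cosine(dcp).shape == (2, 3, 5)
#     assert torch.allclose(spim(dcp).sum(-1), torch.ones(2, 3, dtype=torch.float64))

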
def spims_from_files(decomposer, source_file, target_file, dump_file):
    """Decompose every sentence pair and dump all similarity measures to a CSV."""
    with open(source_file) as src, open(target_file) as tgt, open(dump_file, 'w') as ostr:
        src_lines, tgt_lines = src.readlines(), tgt.readlines()
        assert len(src_lines) == len(tgt_lines)
        writer = csv.writer(ostr)
        _ = writer.writerow(['tok_idx', 'layer_idx', 'I', 'S', 'T', 'F', 'C', 'func', 'source', 'target'])
        for source, target in tqdm.tqdm(zip(src_lines, tgt_lines), total=len(src_lines), desc='dcp'):
            source, target = source.strip(), target.strip()
            decomposition = decomposer(source, target, last_layer_only=False)
            total_embs = decomposition.sum(-2)
            for sim_func in (spim, cosine, l2, norm_ratio):
                sims = sim_func(decomposition, total_embs=total_embs)
                assert sims.size() == decomposition.size()[:-1]
                for layer_idx in range(sims.size(0)):
                    for tok_idx in range(sims.size(1)):
                        # One value per decomposition term: I, S, T, F, C.
                        I, S, T, F, C = sims[layer_idx, tok_idx].tolist()
                        _ = writer.writerow([tok_idx, layer_idx, I, S, T, F, C, sim_func.__name__, source, target])


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser('parallel corpus to CSV of spim')
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--tokenizer', type=str, default=None)
    parser.add_argument('--src', type=pathlib.Path, required=True)
    parser.add_argument('--tgt', type=pathlib.Path, required=True)
    parser.add_argument('--csv', type=pathlib.Path, required=True)
    parser.add_argument('--do_generate', action='store_true')
    parser.add_argument('--device', type=torch.device, default=torch.device('cuda'))
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
    if args.do_generate:
        # Translate the source corpus with the MarianMT model and write the
        # hypotheses to --tgt before decomposing them.
        model_mt = hf.MarianMTModel.from_pretrained(args.model).to(device=args.device, dtype=torch.float64)
        model_mt.eval()
        tokenizer = hf.MarianTokenizer.from_pretrained(args.tokenizer)
        with open(args.src) as istr:
            source_sentences = list(map(str.strip, istr))
        with open(args.tgt, 'w') as ostr:
            for source in tqdm.tqdm(source_sentences, desc='gen'):
                inputs = tokenizer(source, return_tensors='pt').to(device=args.device)
                translation = model_mt.generate(**inputs)
                print(tokenizer.decode(translation[0], skip_special_tokens=True), file=ostr)
        del model_mt, tokenizer
    decomposer = Decomposer(args.model, args.tokenizer, device=args.device)
    spims_from_files(decomposer, args.src, args.tgt, args.csv)
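
# Example invocation (model name and file paths are hypothetical; any
# MarianMT checkpoint such as Helsinki-NLP/opus-mt-en-de should work):
#
#     python dump_sims.py --model Helsinki-NLP/opus-mt-en-de \
#         --src corpus.en --tgt corpus.de --csv sims.csv --do_generate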