# calc_transformer_params.py
# By Quentin Anthony and Beren Millidge

import argparse
import math


# Helper function to pretty-print message sizes
def convert_params(params):
    if params == 0:
        return "0"
    size_name = ("", "K", "M", "B", "T", "P", "E", "Z", "Y")
    i = int(math.floor(math.log(params, 1000)))
    p = math.pow(1000, i)
    s = round(params / p, 2)
    return "%s %s" % (s, size_name[i])

def config_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab-size", "-v",
                        type=int,
                        default=51200,
                        help='Size of the vocab')
    parser.add_argument("--tied-embeddings",
                        action="store_true",
                        help='Whether embeddings are tied (shared between input and output)')
    parser.add_argument("--hidden-size", "-hs",
                        type=int,
                        default=6144,
                        help='Dimension of the model\'s hidden size')
    parser.add_argument("--sequence-length", "-s",
                        type=int,
                        default=2048,
                        help='Sequence length used for training')
    parser.add_argument("--num-layers", "-l",
                        type=int,
                        default=44,
                        help='Number of transformer layers used in the model')
    parser.add_argument("--moe",
                        action="store_true",
                        help='Whether the model is a Mixture of Experts (MoE)')
    parser.add_argument("--num-experts", "-e",
                        type=int,
                        default=8,
                        help='Number of experts for MoE')
    parser.add_argument("--expert-interval", "-ei",
                        type=int,
                        default=1,
                        help='Interval between MoE layers (e.g. 2 means every other layer is MoE)')
    parser.add_argument("--topk", "-t",
                        type=int,
                        default=1,
                        help='Top-k routing for MoE')
    parser.add_argument("--ffn-expansion-factor", "-ff",
                        type=int,
                        default=4,
                        help='How much the MLP hidden size expands')
    parser.add_argument("--num-mlp-linears", "-nl",
                        type=int,
                        default=2,
                        help='How many linear layers per MLP block')
    parser.add_argument("--kv-size-ratio", "-kv",
                        type=float,
                        default=1.0,
                        help='Fraction of the number of query heads used for key/value heads (1.0 = standard MHA, smaller for GQA/MQA)')
    return parser
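
# Illustrative only (hypothetical usage, not part of the original script): the
# same parser can also be driven programmatically, e.g. for the GPT-3 175B
# example printed in __main__ below:
#     args = config_parser().parse_args(['-l', '96', '-hs', '12288'])
#     calc_params(args)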


# Calculates the parameter count of a model given its hyperparameters
def calc_params(args):
    # Calculate embedding and unembedding params. If tied, re-use the same params
    if args.tied_embeddings:
        embedding_params = args.hidden_size * args.vocab_size
    else:
        embedding_params = 2 * args.hidden_size * args.vocab_size
    position_embedding_params = args.hidden_size * args.sequence_length
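    # Illustrative worked example (not part of the original script), using the
    # defaults with untied embeddings (v=51200, h=6144, s=2048):
    # embedding_params = 2*6144*51200 ≈ 629.15M and
    # position_embedding_params = 6144*2048 ≈ 12.58M.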
    # Each QKVO matrix is (hxh)
    # Unless using GQA/MQA which makes K/V smaller
    attention_params = int(2 * (1 + args.kv_size_ratio) * args.num_layers * args.hidden_size * args.hidden_size)
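    # Illustrative worked example (not part of the original script), using the
    # defaults (l=44, h=6144, kv_size_ratio=1.0):
    # 2 * (1 + 1.0) * 44 * 6144^2 = 6,643,777,536 ≈ 6.64B attention parameters.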
    # (4*2)lh from the layernorm weights and biases for each of the QKV and mlp_in layernorms, 1h for the final layernorm.
    # the extra 4lh is a mystery but we include it here
    layernorm_params = 13 * args.num_layers * args.hidden_size
    #ffn_params = 12 * args.num_layers * args.hidden_size * args.hidden_size

    if args.moe:
        # the number of layers that are MoE (e.g. the interval is 2 for GShard)
        num_expert_layers = args.num_layers / args.expert_interval
        # the number of FFN params for each MoE layer
        ffn_expert_params = args.num_mlp_linears * args.ffn_expansion_factor * num_expert_layers * args.num_experts * args.hidden_size * args.hidden_size
        # the number of FFN params for every dense layer
        ffn_dense_params = args.num_mlp_linears * args.ffn_expansion_factor * (args.num_layers - num_expert_layers) * args.hidden_size * args.hidden_size
        ffn_params = ffn_expert_params + ffn_dense_params
        # the number of gating layer params, assuming it's implemented as a simple linear layer
        gating_params = num_expert_layers * args.hidden_size * args.num_experts
    else:
        # num_mlp_linears * (h x [ffn_expansion_factor * h]) FFN matrices
        ffn_params = args.num_mlp_linears * args.ffn_expansion_factor * args.num_layers * args.hidden_size * args.hidden_size
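
    # Illustrative worked example (not part of the original script), using the
    # dense defaults (num_mlp_linears=2, ffn_expansion_factor=4, l=44, h=6144):
    # 2 * 4 * 44 * 6144^2 = 13,287,555,072 ≈ 13.29B FFN parameters.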

    total_params = embedding_params + attention_params + ffn_params + position_embedding_params + layernorm_params
    if args.moe:
        total_params += gating_params

    print(f'Calculating number of parameters with training configuration: {vars(args)}\n')
    print(f'Embedding parameters: {convert_params(embedding_params)}')
    print(f'Attention parameters: {convert_params(attention_params)}')
    print(f'FFN parameters: {convert_params(ffn_params)}')
    if args.moe:
        print(f'Gating parameters: {convert_params(gating_params)}')
    print(f'Total Params in the Model: {convert_params(total_params)}')


if __name__ == "__main__":
    print('\nExample with Fairseq-MoE 15B: python calc_transformer_params.py -l 12 -hs 768 --moe -e 512')
    print('Example with GPT-3 175B: python calc_transformer_params.py -l 96 -hs 12288')
    args = config_parser().parse_args()
    calc_params(args)
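
# Illustrative check (not from the original script): running with no flags uses the
# defaults above (v=51200, h=6144, s=2048, l=44, untied embeddings), and the formulas
# give 629.15M embedding + 12.58M position + 6.64B attention + 3.51M layernorm +
# 13.29B FFN ≈ 20.58B total parameters, i.e. roughly a GPT-NeoX-20B-scale model.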