forked from gpustack/gguf-parser-go
-
Notifications
You must be signed in to change notification settings - Fork 0
/
file_estimate_option.go
275 lines (247 loc) · 7.88 KB
/
file_estimate_option.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
package gguf_parser
import (
"slices"
"github.com/gpustack/gguf-parser-go/util/ptr"
)
type (
	// _LLaMACppRunEstimateOptions collects every knob that the
	// LLaMACppRunEstimateOption functions in this file can set.
	// Pointer fields are nil until the corresponding With* option is
	// applied, letting the estimator distinguish "unset" from a zero value.
	_LLaMACppRunEstimateOptions struct {
		Architecture *GGUFArchitecture // pre-parsed architecture metadata, see WithArchitecture
		Tokenizer *GGUFTokenizer // pre-parsed tokenizer metadata, see WithTokenizer
		ContextSize *int32 // positive context size, see WithContextSize
		InMaxContextSize bool // clamp context size to the model maximum, see WithinMaxContextSize
		LogicalBatchSize *int32 // positive logical batch size, see WithLogicalBatchSize
		PhysicalBatchSize *int32 // positive physical batch size, see WithPhysicalBatchSize
		ParallelSize *int32 // positive number of parallel decoding sequences, see WithParallelSize
		CacheKeyType *GGMLType // KV cache key type, restricted to _GGUFEstimateCacheTypeAllowList
		CacheValueType *GGMLType // KV cache value type, restricted to _GGUFEstimateCacheTypeAllowList
		OffloadKVCache *bool // set to false by WithoutOffloadKVCache
		OffloadLayers *uint64 // number of layers to offload, see WithOffloadLayers
		FlashAttention bool // enabled by WithFlashAttention
		SplitMode LLaMACppSplitMode // zero value is LLaMACppSplitModeLayer, see WithSplitMode
		TensorSplitFraction []float64 // cumulative fractions in [0, 1] ending at 1, see WithTensorSplitFraction
		MainGPUIndex int // see WithMainGPUIndex; only meaningful when TensorSplitFraction is set
		RPCServers []string // non-empty RPC server list, see WithRPCServers
		Projector *LLaMACppRunEstimate // multimodal projector estimate, see WithProjector
		Drafter *LLaMACppRunEstimate // speculative-decoding drafter estimate, see WithDrafter
		Adapters []LLaMACppRunEstimate // non-empty adapter estimates, see WithAdapters
		DeviceMetrics []LLaMACppRunDeviceMetric // non-empty per-device metrics, see WithDeviceMetrics
	}
	// LLaMACppRunDeviceMetric holds the device metric for the estimate.
	//
	// When the device represents a CPU,
	// FLOPS refers to the floating-point operations per second of that CPU,
	// while UpBandwidth indicates the bandwidth of the RAM (since SRAM is typically small and cannot hold all weights,
	// the RAM here refers to the bandwidth of DRAM,
	// unless the device's SRAM can accommodate the corresponding model weights).
	//
	// When the device represents a GPU,
	// FLOPS refers to the floating-point operations per second of that GPU,
	// while UpBandwidth indicates the bandwidth of the VRAM.
	//
	// When the device represents a specific node,
	// FLOPS depends on whether a CPU or GPU is being used,
	// while UpBandwidth refers to the network bandwidth between nodes.
	LLaMACppRunDeviceMetric struct {
		// FLOPS is the floating-point operations per second of the device.
		FLOPS FLOPSScalar
		// UpBandwidth is the bandwidth of the device to transmit data to calculate,
		// unit is Bps (bytes per second).
		UpBandwidth BytesPerSecondScalar
		// DownBandwidth is the bandwidth of the device to transmit calculated result to next layer,
		// unit is Bps (bytes per second).
		DownBandwidth BytesPerSecondScalar
	}
	// LLaMACppRunEstimateOption is the options for the estimate.
	LLaMACppRunEstimateOption func(*_LLaMACppRunEstimateOptions)
)
// WithArchitecture sets the architecture for the estimate.
//
// Allows reusing the same GGUFArchitecture for multiple estimates
// instead of re-parsing it each time.
func WithArchitecture(arch GGUFArchitecture) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		opts.Architecture = &arch
	}
}
// WithTokenizer sets the tokenizer for the estimate.
//
// Allows reusing the same GGUFTokenizer for multiple estimates
// instead of re-parsing it each time.
func WithTokenizer(tokenizer GGUFTokenizer) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		opts.Tokenizer = &tokenizer
	}
}
// WithContextSize sets the context size for the estimate.
//
// Non-positive sizes are silently ignored, leaving the default in place.
func WithContextSize(size int32) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		if size > 0 {
			opts.ContextSize = &size
		}
	}
}
// WithinMaxContextSize limits the context size to the maximum,
// if the context size is over the maximum.
func WithinMaxContextSize() LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		opts.InMaxContextSize = true
	}
}
// WithLogicalBatchSize sets the logical batch size for the estimate.
//
// Non-positive sizes are silently ignored, leaving the default in place.
func WithLogicalBatchSize(size int32) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		if size > 0 {
			opts.LogicalBatchSize = &size
		}
	}
}
// WithPhysicalBatchSize sets the physical batch size for the estimate.
//
// Non-positive sizes are silently ignored, leaving the default in place.
func WithPhysicalBatchSize(size int32) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		if size > 0 {
			opts.PhysicalBatchSize = &size
		}
	}
}
// WithParallelSize sets the (decoding sequences) parallel size for the estimate.
//
// Non-positive sizes are silently ignored, leaving the default in place.
func WithParallelSize(size int32) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		if size > 0 {
			opts.ParallelSize = &size
		}
	}
}
// _GGUFEstimateCacheTypeAllowList is the allow list of cache key and value types.
//
// WithCacheKeyType and WithCacheValueType silently ignore any GGMLType
// that is not in this list.
// NOTE(review): presumably this mirrors the KV-cache quantization types
// accepted by llama.cpp's --cache-type-k/--cache-type-v flags — confirm
// against upstream before extending.
var _GGUFEstimateCacheTypeAllowList = []GGMLType{
	GGMLTypeF32,
	GGMLTypeF16,
	GGMLTypeQ8_0,
	GGMLTypeQ4_0, GGMLTypeQ4_1,
	GGMLTypeIQ4_NL,
	GGMLTypeQ5_0, GGMLTypeQ5_1,
}
// WithCacheKeyType sets the cache key type for the estimate.
//
// Types outside _GGUFEstimateCacheTypeAllowList are silently ignored.
func WithCacheKeyType(t GGMLType) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		if !slices.Contains(_GGUFEstimateCacheTypeAllowList, t) {
			return
		}
		opts.CacheKeyType = &t
	}
}
// WithCacheValueType sets the cache value type for the estimate.
//
// Types outside _GGUFEstimateCacheTypeAllowList are silently ignored.
func WithCacheValueType(t GGMLType) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		if !slices.Contains(_GGUFEstimateCacheTypeAllowList, t) {
			return
		}
		opts.CacheValueType = &t
	}
}
// WithoutOffloadKVCache disables offloading the KV cache.
func WithoutOffloadKVCache() LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		disabled := false
		opts.OffloadKVCache = &disabled
	}
}
// WithOffloadLayers sets the number of layers to offload.
func WithOffloadLayers(layers uint64) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		opts.OffloadLayers = &layers
	}
}
// WithFlashAttention enables the flash attention flag for the estimate.
func WithFlashAttention() LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		opts.FlashAttention = true
	}
}
// LLaMACppSplitMode is the split mode for LLaMACpp.
//
// NOTE(review): these appear to mirror llama.cpp's --split-mode values
// (layer / row / none) — confirm against upstream.
type LLaMACppSplitMode uint

const (
	// LLaMACppSplitModeLayer is the zero value and therefore the default
	// when WithSplitMode is not applied.
	LLaMACppSplitModeLayer LLaMACppSplitMode = iota
	LLaMACppSplitModeRow
	LLaMACppSplitModeNone
	// _LLAMACppSplitModeMax is a sentinel used by WithSplitMode to reject
	// out-of-range values; it is not a valid mode itself.
	_LLAMACppSplitModeMax
)
// WithSplitMode sets the split mode for the estimate.
//
// Values at or beyond _LLAMACppSplitModeMax are silently ignored.
func WithSplitMode(mode LLaMACppSplitMode) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		if mode >= _LLAMACppSplitModeMax {
			return
		}
		opts.SplitMode = mode
	}
}
// WithTensorSplitFraction sets the tensor split cumulative fractions for the estimate.
//
// WithTensorSplitFraction accepts a slice of cumulative fractions:
// every value must be in the range [0, 1] and the last value must be
// exactly 1, otherwise the whole slice is silently ignored.
//
// For example, WithTensorSplitFraction([]float64{0.2, 0.4, 0.6, 0.8, 1})
// splits the tensor into five parts with 20% each.
func WithTensorSplitFraction(fractions []float64) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		n := len(fractions)
		if n == 0 || fractions[n-1] != 1 {
			return
		}
		for _, f := range fractions {
			if f < 0 || f > 1 {
				return
			}
		}
		opts.TensorSplitFraction = fractions
	}
}
// WithMainGPUIndex sets the main device for the estimate.
//
// When split mode is LLaMACppSplitModeNone, the main device is the only device.
// When split mode is LLaMACppSplitModeRow, the main device handles the intermediate results and KV.
//
// WithMainGPUIndex only works when TensorSplitFraction is set.
func WithMainGPUIndex(di int) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		opts.MainGPUIndex = di
	}
}
// WithRPCServers sets the RPC servers for the estimate.
//
// An empty slice is silently ignored.
func WithRPCServers(srvs []string) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		if len(srvs) > 0 {
			opts.RPCServers = srvs
		}
	}
}
// WithDrafter sets the drafter estimate usage.
func WithDrafter(dft *LLaMACppRunEstimate) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		opts.Drafter = dft
	}
}
// WithProjector sets the multimodal projector estimate usage.
func WithProjector(prj *LLaMACppRunEstimate) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		opts.Projector = prj
	}
}
// WithAdapters sets the adapters estimate usage.
//
// An empty slice is silently ignored.
func WithAdapters(adp []LLaMACppRunEstimate) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		if len(adp) > 0 {
			opts.Adapters = adp
		}
	}
}
// WithDeviceMetrics sets the device metrics for the estimate.
//
// An empty slice is silently ignored.
func WithDeviceMetrics(metrics []LLaMACppRunDeviceMetric) LLaMACppRunEstimateOption {
	return func(opts *_LLaMACppRunEstimateOptions) {
		if len(metrics) > 0 {
			opts.DeviceMetrics = metrics
		}
	}
}