# OptimizeForArchitecture.cmake
#
# Defines the OptimizeForArchitecture() macro, which selects compiler flags
# for the CPU named by the TARGET_ARCHITECTURE cache variable.
include (${CMAKE_SOURCE_DIR}/AddCompilerFlag.cmake)
include (${CMAKE_SOURCE_DIR}/MacroEnsureVersion.cmake)
# _my_find(<list-variable-name> <value> <result-variable-name>)
#
# Sets <result-variable-name> in the caller's scope to TRUE when <value> is an
# element of the list stored in <list-variable-name>, and to FALSE otherwise
# (including when the list variable is empty or undefined).
#
# Implemented as a function so the temporary index variable stays local and
# cannot clobber a variable of the same name in the caller; only the result
# variable is written back.
function(_my_find _list _value _ret)
  # list(FIND) stores -1 when the value is not present.
  list(FIND ${_list} "${_value}" _my_find_index)
  if(_my_find_index EQUAL -1)
    set(${_ret} FALSE PARENT_SCOPE)
  else()
    set(${_ret} TRUE PARENT_SCOPE)
  endif()
endfunction()
# OptimizeForArchitecture()
#
# Chooses compiler flags for the CPU named by the TARGET_ARCHITECTURE cache
# variable (default "auto").
#
# Steps:
#   1. With "auto", probe the build host (/proc/cpuinfo on Linux, sysctl on
#      Darwin, the registry on Windows) and map vendor/family/model to one of
#      the known architecture names; unknown CPUs fall back to "generic"
#      (or "merom" for unknown newer Intel family 6 models).
#   2. Translate the architecture name into a list of candidate -march values
#      and a list of available vector units.
#   3. Publish <UNIT>_FOUND variables and USE_<UNIT> cache switches. The
#      switches are reset (FORCE) only when TARGET_ARCHITECTURE changed since
#      the previous run, so manual overrides survive re-configuration.
#   4. Hand the resulting flags to AddCompilerFlag() / add_definitions(),
#      depending on whether the compiler looks like MSVC, ICC or something
#      GCC-compatible.
#
# This is a macro, so every variable it sets lands in the caller's scope.
# Requires AddCompilerFlag() and macro_ensure_version() to be available.
macro(OptimizeForArchitecture)
  set(TARGET_ARCHITECTURE "auto" CACHE STRING "CPU architecture to optimize for. Using an incorrect setting here can result in crashes of the resulting binary because of invalid instructions used.\nSetting the value to \"auto\" will try to optimize for the architecture where cmake is called.\nOther supported values are: \"generic\", \"core\", \"merom\" (65nm Core2), \"penryn\" (45nm Core2), \"nehalem\", \"westmere\", \"sandy-bridge\", \"ivy-bridge\", \"atom\", \"k8\", \"k8-sse3\", \"barcelona\", \"istanbul\", \"magny-cours\", \"bulldozer\", \"interlagos\".")

  # Only overwrite the USE_* cache switches when the requested architecture
  # differs from the one seen on the previous configure run.
  set(_force)
  if(NOT _last_target_arch STREQUAL "${TARGET_ARCHITECTURE}")
    message(STATUS "${TARGET_ARCHITECTURE} changed")
    set(_force FORCE)
  endif()
  set(_last_target_arch "${TARGET_ARCHITECTURE}" CACHE STRING "" FORCE)
  mark_as_advanced(_last_target_arch)
  string(TOLOWER "${TARGET_ARCHITECTURE}" TARGET_ARCHITECTURE)

  set(_march_flag_list)
  set(_available_vector_units_list)
  # Cleared unconditionally so a stale value from the caller cannot influence
  # the Penryn SSE4.1 check below when no auto-detection runs.
  set(_cpu_flags)

  if(TARGET_ARCHITECTURE STREQUAL "auto")
    set(TARGET_ARCHITECTURE "generic")
    set(_vendor_id)
    set(_cpu_family)
    set(_cpu_model)
    if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
      file(READ "/proc/cpuinfo" _cpuinfo)
      # The leading greedy ".*" makes each pattern pick the last matching
      # entry in the file.
      string(REGEX REPLACE ".*vendor_id[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _vendor_id "${_cpuinfo}")
      string(REGEX REPLACE ".*cpu family[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_family "${_cpuinfo}")
      string(REGEX REPLACE ".*model[ \t]*:[ \t]+([a-zA-Z0-9_-]+).*" "\\1" _cpu_model "${_cpuinfo}")
      # Capture the complete flags line (not just its first token) and anchor
      # it to the start of a line so entries such as "vmx flags" are skipped.
      string(REGEX REPLACE ".*\nflags[ \t]*:[ \t]+([^\n]+).*" "\\1" _cpu_flags "${_cpuinfo}")
    elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
      execute_process(COMMAND /usr/sbin/sysctl -n machdep.cpu.vendor
        OUTPUT_VARIABLE _vendor_id OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
      execute_process(COMMAND /usr/sbin/sysctl -n machdep.cpu.model
        OUTPUT_VARIABLE _cpu_model OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
      execute_process(COMMAND /usr/sbin/sysctl -n machdep.cpu.family
        OUTPUT_VARIABLE _cpu_family OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
      execute_process(COMMAND /usr/sbin/sysctl -n machdep.cpu.features
        OUTPUT_VARIABLE _cpu_flags OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
      # Normalize to the /proc/cpuinfo spelling (e.g. "SSE4.1" -> "sse4_1").
      string(TOLOWER "${_cpu_flags}" _cpu_flags)
      string(REPLACE "." "_" _cpu_flags "${_cpu_flags}")
    elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows")
      get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE)
      get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE)
      mark_as_advanced(_vendor_id _cpu_id)
      string(REGEX REPLACE ".* Family ([0-9]+) .*" "\\1" _cpu_family "${_cpu_id}")
      string(REGEX REPLACE ".* Model ([0-9]+) .*" "\\1" _cpu_model "${_cpu_id}")
      message(AUTHOR_WARNING "If you know how to query the processor capabilities wrt. SSE on Windows, let me know!")
    endif()

    if(_vendor_id STREQUAL "GenuineIntel")
      if(_cpu_family EQUAL 6)
        # Any recent Intel CPU except NetBurst
        if(_cpu_model EQUAL 58) # Ivy Bridge
          set(TARGET_ARCHITECTURE "ivy-bridge")
        elseif(_cpu_model EQUAL 46) # Xeon 7500 series
          set(TARGET_ARCHITECTURE "westmere")
        elseif(_cpu_model EQUAL 45) # Xeon TNG
          set(TARGET_ARCHITECTURE "sandy-bridge")
        elseif(_cpu_model EQUAL 44) # Xeon 5600 series
          set(TARGET_ARCHITECTURE "westmere")
        elseif(_cpu_model EQUAL 42) # Core TNG
          set(TARGET_ARCHITECTURE "sandy-bridge")
        elseif(_cpu_model EQUAL 37) # Core i7/i5/i3
          set(TARGET_ARCHITECTURE "westmere")
        elseif(_cpu_model EQUAL 31) # Core i7/i5
          set(TARGET_ARCHITECTURE "westmere")
        elseif(_cpu_model EQUAL 30) # Core i7/i5
          set(TARGET_ARCHITECTURE "westmere")
        elseif(_cpu_model EQUAL 29)
          set(TARGET_ARCHITECTURE "penryn")
        elseif(_cpu_model EQUAL 28)
          set(TARGET_ARCHITECTURE "atom")
        elseif(_cpu_model EQUAL 26)
          set(TARGET_ARCHITECTURE "nehalem")
        elseif(_cpu_model EQUAL 23)
          set(TARGET_ARCHITECTURE "penryn")
        elseif(_cpu_model EQUAL 15)
          set(TARGET_ARCHITECTURE "merom")
        elseif(_cpu_model EQUAL 14)
          set(TARGET_ARCHITECTURE "core")
        elseif(_cpu_model LESS 14)
          message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the generic CPU settings with SSE2.")
          set(TARGET_ARCHITECTURE "generic")
        else()
          message(WARNING "Your CPU (family ${_cpu_family}, model ${_cpu_model}) is not known. Auto-detection of optimization flags failed and will use the 65nm Core 2 CPU settings.")
          set(TARGET_ARCHITECTURE "merom")
        endif()
      elseif(_cpu_family EQUAL 7) # Itanium (not supported)
        message(WARNING "Your CPU (Itanium: family ${_cpu_family}, model ${_cpu_model}) is not supported by OptimizeForArchitecture.cmake.")
      elseif(_cpu_family EQUAL 15) # NetBurst
        # TARGET_ARCHITECTURE stays "generic"; only the vector units are recorded.
        list(APPEND _available_vector_units_list "sse" "sse2")
        if(_cpu_model GREATER 2) # Not sure whether this must be 3 or even 4 instead
          list(APPEND _available_vector_units_list "sse3")
        endif()
      endif()
    elseif(_vendor_id STREQUAL "AuthenticAMD")
      if(_cpu_family EQUAL 21) # 15h
        set(TARGET_ARCHITECTURE "bulldozer")
      elseif(_cpu_family EQUAL 20) # 14h
        # No dedicated settings; stays "generic".
      elseif(_cpu_family EQUAL 18) # 12h
        # No dedicated settings; stays "generic".
      elseif(_cpu_family EQUAL 16) # 10h
        set(TARGET_ARCHITECTURE "barcelona")
      elseif(_cpu_family EQUAL 15)
        set(TARGET_ARCHITECTURE "k8")
        if(_cpu_model GREATER 64) # I don't know the right number to put here. This is just a guess from the hardware I have access to
          set(TARGET_ARCHITECTURE "k8-sse3")
        endif()
      endif()
    endif()
    message(STATUS "Detected CPU: ${TARGET_ARCHITECTURE}")
  endif()

  # Map the architecture name to -march candidates (most specific first; the
  # first one the compiler accepts is used) and to the available vector units.
  if(TARGET_ARCHITECTURE STREQUAL "core")
    list(APPEND _march_flag_list "core2")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3")
  elseif(TARGET_ARCHITECTURE STREQUAL "merom")
    list(APPEND _march_flag_list "merom")
    list(APPEND _march_flag_list "core2")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3")
  elseif(TARGET_ARCHITECTURE STREQUAL "penryn")
    list(APPEND _march_flag_list "penryn")
    list(APPEND _march_flag_list "core2")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3")
    message(STATUS "Sadly the Penryn architecture exists in variants with SSE4.1 and without SSE4.1.")
    if(_cpu_flags MATCHES "sse4_1")
      message(STATUS "SSE4.1: enabled (auto-detected from this computer's CPU flags)")
      list(APPEND _available_vector_units_list "sse4.1")
    else()
      message(STATUS "SSE4.1: disabled (auto-detected from this computer's CPU flags)")
    endif()
  elseif(TARGET_ARCHITECTURE STREQUAL "nehalem")
    list(APPEND _march_flag_list "nehalem")
    list(APPEND _march_flag_list "corei7")
    list(APPEND _march_flag_list "core2")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2")
  elseif(TARGET_ARCHITECTURE STREQUAL "westmere")
    list(APPEND _march_flag_list "westmere")
    list(APPEND _march_flag_list "corei7")
    list(APPEND _march_flag_list "core2")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2")
  elseif(TARGET_ARCHITECTURE STREQUAL "sandy-bridge")
    list(APPEND _march_flag_list "sandybridge")
    list(APPEND _march_flag_list "corei7-avx")
    list(APPEND _march_flag_list "core2")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "avx")
  elseif(TARGET_ARCHITECTURE STREQUAL "ivy-bridge")
    list(APPEND _march_flag_list "ivybridge")
    list(APPEND _march_flag_list "sandybridge")
    list(APPEND _march_flag_list "corei7-avx")
    list(APPEND _march_flag_list "core2")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4.1" "sse4.2" "avx" "avxi")
  elseif(TARGET_ARCHITECTURE STREQUAL "atom")
    list(APPEND _march_flag_list "atom")
    list(APPEND _march_flag_list "core2")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3")
  elseif(TARGET_ARCHITECTURE STREQUAL "k8")
    list(APPEND _march_flag_list "k8")
    list(APPEND _available_vector_units_list "sse" "sse2")
  elseif(TARGET_ARCHITECTURE STREQUAL "k8-sse3")
    list(APPEND _march_flag_list "k8-sse3")
    list(APPEND _march_flag_list "k8")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3")
  elseif(TARGET_ARCHITECTURE STREQUAL "interlagos")
    list(APPEND _march_flag_list "bulldozer")
    list(APPEND _march_flag_list "barcelona")
    list(APPEND _march_flag_list "core2")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4")
  elseif(TARGET_ARCHITECTURE STREQUAL "bulldozer")
    list(APPEND _march_flag_list "bulldozer")
    list(APPEND _march_flag_list "barcelona")
    list(APPEND _march_flag_list "core2")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "ssse3" "sse4a" "sse4.1" "sse4.2" "avx" "xop" "fma4")
  elseif(TARGET_ARCHITECTURE STREQUAL "barcelona")
    list(APPEND _march_flag_list "barcelona")
    list(APPEND _march_flag_list "core2")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a")
  elseif(TARGET_ARCHITECTURE STREQUAL "istanbul")
    list(APPEND _march_flag_list "barcelona")
    list(APPEND _march_flag_list "core2")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a")
  elseif(TARGET_ARCHITECTURE STREQUAL "magny-cours")
    list(APPEND _march_flag_list "barcelona")
    list(APPEND _march_flag_list "core2")
    list(APPEND _available_vector_units_list "sse" "sse2" "sse3" "sse4a")
  elseif(TARGET_ARCHITECTURE STREQUAL "generic")
    list(APPEND _march_flag_list "generic")
  else()
    message(FATAL_ERROR "Unknown target architecture: \"${TARGET_ARCHITECTURE}\". Please set TARGET_ARCHITECTURE to a supported value.")
  endif()

  set(_disable_vector_unit_list)
  set(_enable_vector_unit_list)

  # Record which vector units the selected architecture provides.
  _my_find(_available_vector_units_list "sse2" SSE2_FOUND)
  _my_find(_available_vector_units_list "sse3" SSE3_FOUND)
  _my_find(_available_vector_units_list "ssse3" SSSE3_FOUND)
  _my_find(_available_vector_units_list "sse4.1" SSE4_1_FOUND)
  _my_find(_available_vector_units_list "sse4.2" SSE4_2_FOUND)
  _my_find(_available_vector_units_list "sse4a" SSE4a_FOUND)
  _my_find(_available_vector_units_list "avxi" AVXI_FOUND)
  _my_find(_available_vector_units_list "avx" AVX_FOUND)
  _my_find(_available_vector_units_list "xop" XOP_FOUND)
  _my_find(_available_vector_units_list "fma4" FMA4_FOUND)

  # User-visible switches; ${_force} is "FORCE" only if the architecture changed.
  set(USE_SSE2 ${SSE2_FOUND} CACHE BOOL "Use SSE2. If SSE2 instructions are not enabled the SSE implementation will be disabled." ${_force})
  set(USE_SSE3 ${SSE3_FOUND} CACHE BOOL "Use SSE3. If SSE3 instructions are not enabled they will be emulated." ${_force})
  set(USE_SSSE3 ${SSSE3_FOUND} CACHE BOOL "Use SSSE3. If SSSE3 instructions are not enabled they will be emulated." ${_force})
  set(USE_SSE4_1 ${SSE4_1_FOUND} CACHE BOOL "Use SSE4.1. If SSE4.1 instructions are not enabled they will be emulated." ${_force})
  set(USE_SSE4_2 ${SSE4_2_FOUND} CACHE BOOL "Use SSE4.2. If SSE4.2 instructions are not enabled they will be emulated." ${_force})
  set(USE_SSE4a ${SSE4a_FOUND} CACHE BOOL "Use SSE4a. If SSE4a instructions are not enabled they will be emulated." ${_force})
  set(USE_AVX ${AVX_FOUND} CACHE BOOL "Use AVX. This will double some of the vector sizes relative to SSE." ${_force})
  set(USE_AVXI ${AVXI_FOUND} CACHE BOOL "Use AVX-I. This will double some of the vector sizes relative to SSE." ${_force})
  set(USE_XOP ${XOP_FOUND} CACHE BOOL "Use XOP." ${_force})
  set(USE_FMA4 ${FMA4_FOUND} CACHE BOOL "Use FMA4." ${_force})
  mark_as_advanced(USE_SSE2 USE_SSE3 USE_SSSE3 USE_SSE4_1 USE_SSE4_2 USE_SSE4a USE_AVX USE_AVXI USE_XOP USE_FMA4)

  # Sort every unit into the enable or the disable list according to its switch.
  if(USE_SSE2)
    list(APPEND _enable_vector_unit_list "sse2")
  else()
    list(APPEND _disable_vector_unit_list "sse2")
  endif()
  if(USE_SSE3)
    list(APPEND _enable_vector_unit_list "sse3")
  else()
    list(APPEND _disable_vector_unit_list "sse3")
  endif()
  if(USE_SSSE3)
    list(APPEND _enable_vector_unit_list "ssse3")
  else()
    list(APPEND _disable_vector_unit_list "ssse3")
  endif()
  if(USE_SSE4_1)
    list(APPEND _enable_vector_unit_list "sse4.1")
  else()
    list(APPEND _disable_vector_unit_list "sse4.1")
  endif()
  if(USE_SSE4_2)
    list(APPEND _enable_vector_unit_list "sse4.2")
  else()
    list(APPEND _disable_vector_unit_list "sse4.2")
  endif()
  if(USE_SSE4a)
    list(APPEND _enable_vector_unit_list "sse4a")
  else()
    list(APPEND _disable_vector_unit_list "sse4a")
  endif()
  if(USE_AVX)
    list(APPEND _enable_vector_unit_list "avx")
  else()
    list(APPEND _disable_vector_unit_list "avx")
  endif()
  if(USE_AVXI)
    list(APPEND _enable_vector_unit_list "avxi")
  else()
    list(APPEND _disable_vector_unit_list "avxi")
  endif()
  if(USE_AVX OR USE_AVXI)
    # we want SSE intrinsics to result in instructions using the VEX prefix.
    # Otherwise integer ops (which require the older SSE intrinsics) would
    # always have a large penalty.
    # Added once even when both AVX and AVX-I are enabled.
    list(APPEND _enable_vector_unit_list "sse2avx")
  endif()
  if(USE_XOP)
    list(APPEND _enable_vector_unit_list "xop")
  else()
    list(APPEND _disable_vector_unit_list "xop")
  endif()
  if(USE_FMA4)
    list(APPEND _enable_vector_unit_list "fma4")
  else()
    list(APPEND _disable_vector_unit_list "fma4")
  endif()

  if(CMAKE_C_COMPILER MATCHES "cl(.exe)?$") # MSVC
    # MSVC on 32 bit can select only /arch:SSE2
    # MSVC on 64 bit cannot select anything
    if(NOT CMAKE_CL_64)
      _my_find(_enable_vector_unit_list "sse2" _found)
      if(_found)
        AddCompilerFlag("/arch:SSE2")
      endif()
    endif()
    # Define __<UNIT>__ for every enabled unit, e.g. "sse4.1" -> __SSE4_1__.
    foreach(_flag ${_enable_vector_unit_list})
      string(TOUPPER "${_flag}" _flag)
      string(REPLACE "." "_" _flag "__${_flag}__")
      add_definitions("-D${_flag}")
    endforeach()
  elseif(CMAKE_CXX_COMPILER MATCHES "/(icpc|icc)$") # ICC
    # Pick the single highest -x level the target architecture provides.
    # The <UNIT>_FOUND variables were computed above from
    # _available_vector_units_list.
    if(AVXI_FOUND)
      AddCompilerFlag("-xCORE-AVX-I")
    elseif(AVX_FOUND)
      AddCompilerFlag("-xAVX")
    elseif(SSE4_2_FOUND)
      AddCompilerFlag("-xSSE4.2")
    elseif(SSE4_1_FOUND)
      AddCompilerFlag("-xSSE4.1")
    elseif(SSSE3_FOUND)
      AddCompilerFlag("-xSSSE3")
    elseif(SSE3_FOUND)
      # If the target host is an AMD machine then we still want to use -xSSE2 because the binary would refuse to run at all otherwise
      _my_find(_march_flag_list "barcelona" _found)
      if(NOT _found)
        _my_find(_march_flag_list "k8-sse3" _found)
      endif()
      if(_found)
        AddCompilerFlag("-xSSE2")
      else()
        AddCompilerFlag("-xSSE3")
      endif()
    elseif(SSE2_FOUND)
      AddCompilerFlag("-xSSE2")
    endif()
  else()
    # Use the first -march value the compiler accepts.
    foreach(_flag ${_march_flag_list})
      AddCompilerFlag("-march=${_flag}" _good)
      if(_good)
        break()
      endif()
    endforeach()
    foreach(_flag ${_enable_vector_unit_list})
      AddCompilerFlag("-m${_flag}")
    endforeach()
    foreach(_flag ${_disable_vector_unit_list})
      AddCompilerFlag("-mno-${_flag}")
    endforeach()
    # Not really target architecture specific, but GCC 4.5.[01] fail at inlining some functions,
    # creating functions with a single instructions, thus a large overhead. This is a good
    # (because central) place to fix the problem
    if(CMAKE_COMPILER_IS_GNUCXX)
      execute_process(COMMAND "${CMAKE_C_COMPILER}" -dumpversion
        OUTPUT_VARIABLE _gcc_version OUTPUT_STRIP_TRAILING_WHITESPACE)
      macro_ensure_version("4.5.0" "${_gcc_version}" GCC_4_5_0)
      if(GCC_4_5_0)
        macro_ensure_version("4.5.2" "${_gcc_version}" GCC_4_5_2)
        if(NOT GCC_4_5_2)
          AddCompilerFlag("--param early-inlining-insns=12")
        endif()
      endif()
    endif()
  endif()
endmacro()