diff --git a/Makefile.config b/Makefile.config
index 7b2af458..1c1be39f 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -75,8 +75,8 @@ ifdef AVX_512
 VEC_GCC  := -mavx512f -mavx512cd # -march=native -fopt-info-vec -mavx
 VEC_ICC  := -xHost -qopt-zmm-usage=high #-march=native -mtune=native #-xcore-avx512
 else ifdef AVX2
-VEC_GCC  := -mavx2
-VEC_ICC  := -mavx2
+VEC_GCC  := -mavx2 -mfma
+VEC_ICC  := -mavx2 -mfma
 else
 VEC_GCC  := -mavx # -fopt-info-vec-all
 VEC_ICC  := -mavx
@@ -150,11 +150,11 @@ ifdef USE_CUDA
 	CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo
 	LDFLAGS_HOST += -L${CUDALIBDIR}
 	ifeq ($(CXX),icpc)
-	  CXXFLAGS += -qopenmp
-	  LDFLAGS  += -qopenmp
+	  CXXFLAGS += -qopenmp-simd
+	  LDFLAGS  += -qopenmp-simd
 	else
-	  CXXFLAGS += -fopenmp
-	  LDFLAGS  += -fopenmp
+	  CXXFLAGS += -fopenmp-simd
+	  LDFLAGS  += -fopenmp-simd
 	endif
 endif
 #CXXFLAGS += -qopenmp
@@ -177,14 +177,14 @@ endif
 
 ifeq (${CXX}, ${ICC})
   VEC_HOST := ${VEC_ICC}
-  CXXFLAGS += -qopt-report=5 -qopenmp -qopt-report-phase=all
+  CXXFLAGS += -qopt-report=5 -qopenmp-simd -qopt-report-phase=all
 else
   VEC_HOST := ${VEC_GCC}
 endif
 
 ifeq ($(CXX), g++)
   CXXFLAGS += -std=c++1z -ftree-vectorize -Werror=main -Werror=pointer-arith -Werror=overlength-strings -Wno-vla -Werror=overflow -Wstrict-overflow -Werror=array-bounds -Werror=format-contains-nul -Werror=type-limits -fvisibility-inlines-hidden -fno-math-errno --param vect-max-version-for-alias-checks=50 -Xassembler --compress-debug-sections -felide-constructors -fmessage-length=0 -Wall -Wno-non-template-friend -Wno-long-long -Wreturn-type -Wunused -Wparentheses -Wno-deprecated -Werror=return-type -Werror=missing-braces -Werror=unused-value -Werror=address -Werror=format -Werror=sign-compare -Werror=write-strings -Werror=delete-non-virtual-dtor -Wstrict-aliasing -Werror=narrowing -Werror=unused-but-set-variable -Werror=reorder -Werror=unused-variable -Werror=conversion-null -Werror=return-local-addr -Wnon-virtual-dtor -Werror=switch -fdiagnostics-show-option -Wno-unused-local-typedefs -Wno-attributes -Wno-psabi
-  CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp
+  CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp-simd
 endif
 
 ifdef WITH_USOLIDS
diff --git a/Matriplex/MatriplexCommon.h b/Matriplex/MatriplexCommon.h
index 11bad035..4eff166d 100644
--- a/Matriplex/MatriplexCommon.h
+++ b/Matriplex/MatriplexCommon.h
@@ -37,14 +37,15 @@
     #define MUL(a, b)     _mm512_mul_ps(a, b)
     #define FMA(a, b, v)  _mm512_fmadd_ps(a, b, v)
 
-  #elif defined(__AVX2__)
+  #elif defined(__AVX2__) && defined(__FMA__)
 
     typedef __m256 IntrVec_t;
     #define MPLEX_INTRINSICS_WIDTH_BYTES  32
     #define MPLEX_INTRINSICS_WIDTH_BITS  256
     #define AVX2_INTRINSICS
     #define GATHER_INTRINSICS
-    #define GATHER_IDX_LOAD(name, arr)  __m256i name = _mm256_load_epi32(arr);
+    // Previously used _mm256_load_epi32(arr) here, but that's part of AVX-512F, not AVX2
+    #define GATHER_IDX_LOAD(name, arr)  __m256i name = _mm256_load_si256(reinterpret_cast<const __m256i *>(arr));
 
     #define LD(a, i)      _mm256_load_ps(&a[i*N+n])
     #define ST(a, i, r)   _mm256_store_ps(&a[i*N+n], r)
diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h
index ffefc6ce..db1658c4 100644
--- a/mkFit/FitterCU-imp.h
+++ b/mkFit/FitterCU-imp.h
@@ -392,8 +392,8 @@ void FitterCU<T>::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC,
 #if 0
     double time_input = dtime();
     int itrack;
-    omp_set_num_threads(Config::numThreadsReorg);
-#pragma omp parallel for
+    //omp_set_num_threads(Config::numThreadsReorg);
+//#pragma omp parallel for
     for (int i = beg; i < end; ++i) {
       itrack = i - beg;
       Track &trk = tracks[i];
diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h
index 3dc26039..1ed9d5b9 100644
--- a/mkFit/FitterCU.h
+++ b/mkFit/FitterCU.h
@@ -16,7 +16,7 @@
 #include "index_selection_kernels.h"
 #include "best_hit_kernels.h"
 
-#include <omp.h>
+//#include <omp.h>
 #include <stdexcept>
 
 #define BLOCK_SIZE_X 256
diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc
index b317b0cd..eb90ef10 100644
--- a/mkFit/MkFitter.cc
+++ b/mkFit/MkFitter.cc
@@ -69,15 +69,14 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>&  tracks,
   int itrack = 0;
 
 // FIXME: uncomment when track building is ported to GPU.
-#if USE_CUDA_NOT_YET
 //#ifdef USE_CUDA
   // This openmp loop brings some performances when using
   // a single thread to fit all events.
   // However, it is more advantageous to use the threads to
   // parallelize over Events.
-  omp_set_num_threads(Config::numThreadsReorg);
-#pragma omp parallel for private(itrack)
-#endif
+//  omp_set_num_threads(Config::numThreadsReorg);
+//#pragma omp parallel for private(itrack)
+//#endif
   for (int i = beg; i < end; ++i, ++itrack)
   {
     const Track &trk = tracks[i];
@@ -121,14 +120,13 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>&  tracks,
 
   int itrack;
 //#ifdef USE_CUDA
-#if 0
   // This openmp loop brings some performances when using
   // a single thread to fit all events.
   // However, it is more advantageous to use the threads to
   // parallelize over Events.
-  omp_set_num_threads(Config::numThreadsReorg);
-#pragma omp parallel for private(itrack)
-#endif
+//  omp_set_num_threads(Config::numThreadsReorg);
+//#pragma omp parallel for private(itrack)
+//#endif
   for (int i = beg; i < end; ++i) {
     itrack = i - beg;
     const Track &trk = tracks[i];
@@ -173,14 +171,13 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>&  tracks,
   MatriplexTrackPacker mtp(tracks[beg]);
 
 //#ifdef USE_CUDA
-#if 0
   // This openmp loop brings some performances when using
   // a single thread to fit all events.
   // However, it is more advantageous to use the threads to
   // parallelize over Events.
-  omp_set_num_threads(Config::numThreadsReorg);
-#pragma omp parallel for private(itrack)
-#endif
+//  omp_set_num_threads(Config::numThreadsReorg);
+//#pragma omp parallel for private(itrack)
+//#endif
   for (int i = beg; i < end; ++i)
   {
     int itrack = i - beg;
diff --git a/mkFit/fittestMPlex.cc b/mkFit/fittestMPlex.cc
index cb5b2ba7..7c81e226 100644
--- a/mkFit/fittestMPlex.cc
+++ b/mkFit/fittestMPlex.cc
@@ -12,7 +12,7 @@
 #if USE_CUDA
 #include "fittestMPlex.h"
 #include "FitterCU.h"
-#include <omp.h>
+//#include <omp.h>
 #endif
 
 #ifndef NO_ROOT
@@ -142,15 +142,18 @@ void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)
 #endif
   separate_first_call_for_meaningful_profiling_numbers();
 
-  // Reorgnanization (copyIn) can eventually be multithreaded.
-  omp_set_nested(1);
+  // Reorganization (copyIn) can eventually be multithreaded.
+// FIXME: revisit multithreading when track building is ported to GPU.
+//  omp_set_nested(1);
       
-  omp_set_num_threads(Config::numThreadsEvents);
+//  omp_set_num_threads(Config::numThreadsEvents);
   double total_gpu_time = dtime();
-#pragma omp parallel reduction(+:s_tmp)
+//#pragma omp parallel reduction(+:s_tmp)
   {
-  int numThreadsEvents = omp_get_num_threads();
-  int thr_idx = omp_get_thread_num();
+//  int numThreadsEvents = omp_get_num_threads();
+//  int thr_idx = omp_get_thread_num();
+  int numThreadsEvents = 1;
+  int thr_idx = 0;
 
   // FitterCU is declared here to share allocations and deallocations
   // between the multiple events processed by a single thread.
@@ -177,11 +180,11 @@ void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)
 #if 0  // 0 for timing, 1 for validation
       // Validation crashes for multiple threads.
       // It is something in relation to ROOT. Not sure what. 
-      if (omp_get_num_threads() <= 1) {
+      //if (omp_get_num_threads() <= 1) {
         //if (g_run_fit_std) {
           std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root";
         //}
-      }
+      //}
 #endif
     }
     cuFitter.free_extra_addBestHit();