diff --git a/CHANGELOG b/CHANGELOG index fa30ed54c..b5e46551d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ List of features / changes made / release notes, in reverse chronological order. If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately). Master, using release name V 2.4.0 (1/7/25) +* PR618: removing alloca and using kernel dispatch on the GPU. * PR617: Caching pip dependencies in github actions. Forcing Ninja when building python on Windows. * PR614: Added support for sccache in github actions. diff --git a/devel/gen_all_horner_C_code.m b/devel/gen_all_horner_C_code.m index 51aa4e4e1..5ef2f7343 100644 --- a/devel/gen_all_horner_C_code.m +++ b/devel/gen_all_horner_C_code.m @@ -16,8 +16,8 @@ ws = 2:16; opts.wpad = false; % pad kernel eval to multiple of 4 - if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w'); - else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w'); + if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop.inc','w'); + else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc','w'); end fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n')); fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n')); @@ -27,9 +27,9 @@ fprintf('w=%d\td=%d\tbeta=%.3g\n',w,d,beta); str = gen_ker_horner_loop_C_code(w,d,beta,opts); if j==1 % write switch statement - fwrite(fid,sprintf(' if (w==%d) {\n',w)); + fwrite(fid,sprintf(' if constexpr (w==%d) {\n',w)); else - fwrite(fid,sprintf(' } else if (w==%d) {\n',w)); + fwrite(fid,sprintf(' } else if constexpr (w==%d) {\n',w)); end for i=1:numel(str); fwrite(fid,[' ',str{i}]); end end diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index 1f4c59e2a..68d138f1d 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -1,205 +1,205 @@ // Code generated by gen_all_horner_C_code.m in finufft/devel // Authors: Alex Barnett & Ludvig af Klinteberg. // (C) The Simons Foundation, Inc. - if (w==2) { - constexpr FLT c0[] = {5.5428559551548406E-01, 5.5428559551548395E-01}; - constexpr FLT c1[] = {7.0481840008800778E-01, -7.0481840008800811E-01}; - constexpr FLT c2[] = {-2.2584311526143548E-02, -2.2584311526143607E-02}; - constexpr FLT c3[] = {-2.5024197515954211E-01, 2.5024197515954211E-01}; + if constexpr (w==2) { + constexpr FLT c0[] = {5.5428559551548373E-01, 5.5428559551548395E-01}; + constexpr FLT c1[] = {7.0481840008800722E-01, -7.0481840008800745E-01}; + constexpr FLT c2[] = {-2.2584311526143652E-02, -2.2584311526143739E-02}; + constexpr FLT c3[] = {-2.5024197515954211E-01, 2.5024197515954222E-01}; for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); - } else if (w==3) { - constexpr FLT c0[] = {1.7787237246937579E-01, 1.0000000000000013E+00, 1.7787237247678464E-01}; - constexpr FLT c1[] = {3.5966530797581003E-01, -4.2425842671825248E-17, -3.5966530796781060E-01}; - constexpr FLT c2[] = {2.0160576446392536E-01, -3.7666666666667331E-01, 2.0160576447145470E-01}; - constexpr FLT c3[] = {-1.7450587318669351E-02, 2.2939218956436377E-17, 1.7450587325767743E-02}; - constexpr FLT c4[] = {-4.2902993854032963E-02, 6.0475925925925586E-02, -4.2902993846219546E-02}; - constexpr FLT c5[] = {-4.5057857403453909E-03, 6.6232851036457955E-18, 4.5057857475245110E-03}; + } else if constexpr (w==3) { + constexpr FLT c0[] = {1.7787237246937559E-01, 1.0000000000000009E+00, 1.7787237247678447E-01}; + constexpr FLT c1[] = {3.5966530797580953E-01, -1.4849044935138803E-16, -3.5966530796781021E-01}; + constexpr FLT c2[] = {2.0160576446392528E-01, -3.7666666666667314E-01, 2.0160576447145470E-01}; + constexpr FLT c3[] = {-1.7450587318669390E-02, -5.9740104982804636E-17, 1.7450587325767725E-02}; + constexpr FLT c4[] = {-4.2902993854032997E-02, 6.0475925925925503E-02, -4.2902993846219602E-02}; + constexpr FLT c5[] = {-4.5057857403455262E-03, -9.2753005973835603E-17, 4.5057857475245449E-03}; for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); - } else if (w==4) { - constexpr FLT c0[] = {3.9828257752799377E-02, 7.3911656575585805E-01, 7.3911656575585805E-01, 3.9828257752799433E-02}; - constexpr FLT c1[] = {1.0749328817387334E-01, 4.5419700247912287E-01, -4.5419700247912287E-01, -1.0749328817387330E-01}; - constexpr FLT c2[] = {1.0408888748149289E-01, -1.0268333881994456E-01, -1.0268333881994476E-01, 1.0408888748149285E-01}; - constexpr FLT c3[] = {3.7516840869185789E-02, -1.0412335657155622E-01, 1.0412335657155641E-01, -3.7516840869185733E-02}; - constexpr FLT c4[] = {-3.5432868834529888E-03, 2.8903049344237370E-03, 2.8903049344238003E-03, -3.5432868834529676E-03}; - constexpr FLT c5[] = {-5.7512181801490673E-03, 1.0945950376831730E-02, -1.0945950376831654E-02, 5.7512181801490829E-03}; - constexpr FLT c6[] = {-7.3657365672905430E-04, 3.7144674885200340E-04, 3.7144674885207063E-04, -7.3657365672907728E-04}; + } else if constexpr (w==4) { + constexpr FLT c0[] = {3.9828257752799370E-02, 7.3911656575585749E-01, 7.3911656575585760E-01, 3.9828257752799363E-02}; + constexpr FLT c1[] = {1.0749328817387323E-01, 4.5419700247912237E-01, -4.5419700247912254E-01, -1.0749328817387323E-01}; + constexpr FLT c2[] = {1.0408888748149286E-01, -1.0268333881994479E-01, -1.0268333881994483E-01, 1.0408888748149285E-01}; + constexpr FLT c3[] = {3.7516840869185775E-02, -1.0412335657155619E-01, 1.0412335657155625E-01, -3.7516840869185754E-02}; + constexpr FLT c4[] = {-3.5432868834529784E-03, 2.8903049344237144E-03, 2.8903049344236845E-03, -3.5432868834529762E-03}; + constexpr FLT c5[] = {-5.7512181801491037E-03, 1.0945950376831656E-02, -1.0945950376831742E-02, 5.7512181801491037E-03}; + constexpr FLT c6[] = {-7.3657365672907186E-04, 3.7144674885203870E-04, 3.7144674885190496E-04, -7.3657365672909203E-04}; for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); - } else if (w==5) { - constexpr FLT c0[] = {1.0051451410391413E-02, 3.8286382489474308E-01, 1.0000000000000009E+00, 3.8286382489474252E-01, 1.0051451410391420E-02}; - constexpr FLT c1[] = {3.0826052021380446E-02, 3.8431958613457984E-01, -4.7102147373384796E-32, -3.8431958613457951E-01, -3.0826052021380446E-02}; - constexpr FLT c2[] = {3.6562231959204314E-02, 7.8509612097392906E-02, -2.3000000000000059E-01, 7.8509612097392906E-02, 3.6562231959204300E-02}; - constexpr FLT c3[] = {2.0250135419918262E-02, -3.9381037339048602E-02, 1.0193845429304082E-16, 3.9381037339048686E-02, -2.0250135419918248E-02}; - constexpr FLT c4[] = {4.0593041193018580E-03, -1.6067481167759540E-02, 2.4150000000000074E-02, -1.6067481167759530E-02, 4.0593041193018597E-03}; - constexpr FLT c5[] = {-9.2488937959280210E-04, 1.2476700479675494E-03, 1.0406437805617128E-16, -1.2476700479676270E-03, 9.2488937959280405E-04}; - constexpr FLT c6[] = {-5.6059657038176136E-04, 1.2116190166774866E-03, -1.5448333333332675E-03, 1.2116190166775878E-03, -5.6059657038176342E-04}; - constexpr FLT c7[] = {-3.4201716508558499E-05, 2.3137115416428607E-05, 3.6450914717742488E-17, -2.3137115416288715E-05, 3.4201716508574924E-05}; + } else if constexpr (w==5) { + constexpr FLT c0[] = {1.0051451410391413E-02, 3.8286382489474285E-01, 1.0000000000000004E+00, 3.8286382489474230E-01, 1.0051451410391404E-02}; + constexpr FLT c1[] = {3.0826052021380428E-02, 3.8431958613457951E-01, -1.6970337068730065E-16, -3.8431958613457923E-01, -3.0826052021380418E-02}; + constexpr FLT c2[] = {3.6562231959204314E-02, 7.8509612097392836E-02, -2.3000000000000087E-01, 7.8509612097392947E-02, 3.6562231959204314E-02}; + constexpr FLT c3[] = {2.0250135419918273E-02, -3.9381037339048630E-02, 4.5265089812433205E-17, 3.9381037339048575E-02, -2.0250135419918266E-02}; + constexpr FLT c4[] = {4.0593041193018563E-03, -1.6067481167759606E-02, 2.4150000000000001E-02, -1.6067481167759586E-02, 4.0593041193018580E-03}; + constexpr FLT c5[] = {-9.2488937959281099E-04, 1.2476700479674141E-03, -1.1417887700143805E-16, -1.2476700479675457E-03, 9.2488937959281511E-04}; + constexpr FLT c6[] = {-5.6059657038176559E-04, 1.2116190166775015E-03, -1.5448333333333995E-03, 1.2116190166774968E-03, -5.6059657038176679E-04}; + constexpr FLT c7[] = {-3.4201716508558133E-05, 2.3137115416426317E-05, -1.6211468115972233E-17, -2.3137115416398588E-05, 3.4201716508565797E-05}; for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); - } else if (w==6) { - constexpr FLT c0[] = {2.0875119883113440E-03, 1.5741818314646622E-01, 8.2446837122968764E-01, 8.2446837122968819E-01, 1.5741818314646633E-01, 2.0875119883208737E-03}; - constexpr FLT c1[] = {7.2383827471879086E-03, 2.0903648995439439E-01, 3.2052935784357633E-01, -3.2052935784357606E-01, -2.0903648995439447E-01, -7.2383827471776260E-03}; - constexpr FLT c2[] = {1.0180085126333453E-02, 9.2337811484269047E-02, -1.0253741712233820E-01, -1.0253741712233828E-01, 9.2337811484268964E-02, 1.0180085126343144E-02}; - constexpr FLT c3[] = {7.3669955501269460E-03, 4.9102900025223507E-03, -5.1302324979469405E-02, 5.1302324979469550E-02, -4.9102900025223160E-03, -7.3669955501178214E-03}; - constexpr FLT c4[] = {2.7444270008043898E-03, -8.0004810696544734E-03, 5.2920367975573743E-03, 5.2920367975574090E-03, -8.0004810696544873E-03, 2.7444270008144425E-03}; - constexpr FLT c5[] = {3.2622379114949894E-04, -1.8514138516535197E-03, 3.8520985619445234E-03, -3.8520985619444454E-03, 1.8514138516535119E-03, -3.2622379114026425E-04}; - constexpr FLT c6[] = {-1.2239646122606432E-04, 2.2750660293442782E-04, -1.2702072030317145E-04, -1.2702072030306984E-04, 2.2750660293439860E-04, -1.2239646121695236E-04}; - constexpr FLT c7[] = {-4.6695893922776242E-05, 1.1717219021520763E-04, -1.8098268625859964E-04, 1.8098268625869589E-04, -1.1717219021517810E-04, 4.6695893931711504E-05}; - constexpr FLT c8[] = {-1.5875418082745247E-06, 7.2147850127730698E-07, -7.0930078293142108E-08, -7.0930078245872243E-08, 7.2147850127811706E-07, -1.5875417996312271E-06}; + } else if constexpr (w==6) { + constexpr FLT c0[] = {2.0875119883113444E-03, 1.5741818314646622E-01, 8.2446837122968719E-01, 8.2446837122968786E-01, 1.5741818314646619E-01, 2.0875119883208707E-03}; + constexpr FLT c1[] = {7.2383827471879060E-03, 2.0903648995439425E-01, 3.2052935784357611E-01, -3.2052935784357595E-01, -2.0903648995439431E-01, -7.2383827471776199E-03}; + constexpr FLT c2[] = {1.0180085126333447E-02, 9.2337811484268992E-02, -1.0253741712233815E-01, -1.0253741712233826E-01, 9.2337811484268992E-02, 1.0180085126343148E-02}; + constexpr FLT c3[] = {7.3669955501269495E-03, 4.9102900025223368E-03, -5.1302324979469585E-02, 5.1302324979469453E-02, -4.9102900025223412E-03, -7.3669955501178205E-03}; + constexpr FLT c4[] = {2.7444270008043898E-03, -8.0004810696545064E-03, 5.2920367975572286E-03, 5.2920367975572606E-03, -8.0004810696545012E-03, 2.7444270008144416E-03}; + constexpr FLT c5[] = {3.2622379114949579E-04, -1.8514138516535774E-03, 3.8520985619444484E-03, -3.8520985619445560E-03, 1.8514138516535451E-03, -3.2622379114026316E-04}; + constexpr FLT c6[] = {-1.2239646122606804E-04, 2.2750660293439196E-04, -1.2702072030321002E-04, -1.2702072030313421E-04, 2.2750660293438364E-04, -1.2239646121695212E-04}; + constexpr FLT c7[] = {-4.6695893922773253E-05, 1.1717219021522327E-04, -1.8098268625867290E-04, 1.8098268625856663E-04, -1.1717219021521502E-04, 4.6695893931712934E-05}; + constexpr FLT c8[] = {-1.5875418082722892E-06, 7.2147850129502829E-07, -7.0930078290747281E-08, -7.0930078325137639E-08, 7.2147850125572373E-07, -1.5875417996328233E-06}; for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); - } else if (w==7) { - constexpr FLT c0[] = {4.0677823488318067E-04, 5.5714997521829540E-02, 5.1113018541287825E-01, 1.0000000000000002E+00, 5.1113018541287869E-01, 5.5714997521829561E-02, 4.0677823488475981E-04}; - constexpr FLT c1[] = {1.5569364307494555E-03, 8.9228372765634056E-02, 3.5049603091348180E-01, -1.8840858949353919E-32, -3.5049603091348197E-01, -8.9228372765634029E-02, -1.5569364307477620E-03}; - constexpr FLT c2[] = {2.4904843753404838E-03, 5.4888936725282375E-02, 2.4759577399513382E-02, -1.6428571428571445E-01, 2.4759577399513264E-02, 5.4888936725282340E-02, 2.4904843753420954E-03}; - constexpr FLT c3[] = {2.1552691780265232E-03, 1.3627105791872422E-02, -3.3718114813591167E-02, 1.0435679823191637E-16, 3.3718114813591278E-02, -1.3627105791872396E-02, -2.1552691780250210E-03}; - constexpr FLT c4[] = {1.0735311014902868E-03, -7.2030895675484117E-04, -6.6760503000563741E-03, 1.2656705539358732E-02, -6.6760503000563680E-03, -7.2030895675483119E-04, 1.0735311014919520E-03}; - constexpr FLT c5[] = {2.8413019973530626E-04, -1.1175797418592351E-03, 1.3906361031252640E-03, 1.0099777883094147E-16, -1.3906361031252017E-03, 1.1175797418592505E-03, -2.8413019973377792E-04}; - constexpr FLT c6[] = {1.6363160465889005E-05, -1.5802085209242310E-04, 4.4431051893374396E-04, -6.0985626028865780E-04, 4.4431051893376408E-04, -1.5802085209243416E-04, 1.6363160467394339E-05}; - constexpr FLT c7[] = {-1.2513684117291295E-05, 2.9105578584781478E-05, -2.8835295309364819E-05, 6.9093005849597210E-17, 2.8835295309456306E-05, -2.9105578584752466E-05, 1.2513684118770622E-05}; - constexpr FLT c8[] = {-3.2859430043343403E-06, 9.3570096164232078E-06, -1.7015821249906871E-05, 2.0688046128660197E-05, -1.7015821249876886E-05, 9.3570096164290557E-06, -3.2859430029058764E-06}; - constexpr FLT c9[] = {-1.5030958477935016E-08, -9.3540219413709317E-08, 1.3079704875560537E-07, 3.0755088144886539E-17, -1.3079704870024676E-07, 9.3540219430316894E-08, 1.5030959705830809E-08}; + } else if constexpr (w==7) { + constexpr FLT c0[] = {4.0677823488318089E-04, 5.5714997521829533E-02, 5.1113018541287780E-01, 9.9999999999999967E-01, 5.1113018541287847E-01, 5.5714997521829547E-02, 4.0677823488475867E-04}; + constexpr FLT c1[] = {1.5569364307494542E-03, 8.9228372765633918E-02, 3.5049603091348147E-01, -1.4849044935138815E-16, -3.5049603091348169E-01, -8.9228372765633959E-02, -1.5569364307477609E-03}; + constexpr FLT c2[] = {2.4904843753404821E-03, 5.4888936725282347E-02, 2.4759577399513327E-02, -1.6428571428571428E-01, 2.4759577399513240E-02, 5.4888936725282347E-02, 2.4904843753420971E-03}; + constexpr FLT c3[] = {2.1552691780265237E-03, 1.3627105791872417E-02, -3.3718114813591105E-02, 6.0458598471891237E-18, 3.3718114813591230E-02, -1.3627105791872405E-02, -2.1552691780250205E-03}; + constexpr FLT c4[] = {1.0735311014902866E-03, -7.2030895675484409E-04, -6.6760503000564131E-03, 1.2656705539358644E-02, -6.6760503000563957E-03, -7.2030895675484919E-04, 1.0735311014919515E-03}; + constexpr FLT c5[] = {2.8413019973530577E-04, -1.1175797418592726E-03, 1.3906361031250946E-03, -1.1555014499291590E-16, -1.3906361031251887E-03, 1.1175797418592557E-03, -2.8413019973377786E-04}; + constexpr FLT c6[] = {1.6363160465887915E-05, -1.5802085209243292E-04, 4.4431051893376538E-04, -6.0985626028865671E-04, 4.4431051893374798E-04, -1.5802085209243606E-04, 1.6363160467395125E-05}; + constexpr FLT c7[] = {-1.2513684117291576E-05, 2.9105578584782810E-05, -2.8835295309289061E-05, 4.4580005943244268E-18, 2.8835295309367696E-05, -2.9105578584777121E-05, 1.2513684118771265E-05}; + constexpr FLT c8[] = {-3.2859430043343644E-06, 9.3570096164309209E-06, -1.7015821249889751E-05, 2.0688046128666583E-05, -1.7015821249932014E-05, 9.3570096164071294E-06, -3.2859430029065286E-06}; + constexpr FLT c9[] = {-1.5030958478324257E-08, -9.3540219420629153E-08, 1.3079704870578265E-07, -9.4759238230668171E-18, -1.3079704867810334E-07, 9.3540219423397084E-08, 1.5030959705595645E-08}; for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==8) { - constexpr FLT c0[] = {7.5442178667264049E-05, 1.7659090182402852E-02, 2.6112828482312650E-01, 8.6561421087578294E-01, 8.6561421087578294E-01, 2.6112828482312650E-01, 1.7659090182402856E-02, 7.5442178667263913E-05}; - constexpr FLT c1[] = {3.1361556564941527E-04, 3.2518751351035657E-02, 2.4295266212395961E-01, 2.5083142126627195E-01, -2.5083142126627200E-01, -2.4295266212395961E-01, -3.2518751351035664E-02, -3.1361556564941506E-04}; - constexpr FLT c2[] = {5.5627094085228170E-04, 2.4604803324737457E-02, 6.5902977410162822E-02, -9.1064379250067565E-02, -9.1064379250067648E-02, 6.5902977410162836E-02, 2.4604803324737447E-02, 5.5627094085228149E-04}; - constexpr FLT c3[] = {5.5053208919074741E-04, 9.2359485489686977E-03, -6.2169545154249764E-03, -3.1386277864020387E-02, 3.1386277864020692E-02, 6.2169545154250301E-03, -9.2359485489686925E-03, -5.5053208919074741E-04}; - constexpr FLT c4[] = {3.3122072653963820E-04, 1.3353118718124376E-03, -5.9878504390516807E-03, 4.3217905833729843E-03, 4.3217905833729184E-03, -5.9878504390516564E-03, 1.3353118718124411E-03, 3.3122072653963842E-04}; - constexpr FLT c5[] = {1.2112223749399388E-04, -2.3174709024353528E-04, -5.1773322458159945E-04, 1.8691284471382664E-03, -1.8691284471382276E-03, 5.1773322458165388E-04, 2.3174709024353332E-04, -1.2112223749399391E-04}; - constexpr FLT c6[] = {2.3288943339077962E-05, -1.1810885265513022E-04, 2.1380000655379686E-04, -1.1905274322668279E-04, -1.1905274322667877E-04, 2.1380000655378596E-04, -1.1810885265513386E-04, 2.3288943339077766E-05}; - constexpr FLT c7[] = {8.7290223704935849E-08, -9.9551635569432461E-06, 3.9042123573714734E-05, -7.0647330846704962E-05, 7.0647330846826175E-05, -3.9042123573667747E-05, 9.9551635569490195E-06, -8.7290223704824623E-08}; - constexpr FLT c8[] = {-1.0444417486661213E-06, 2.8837147790326586E-06, -3.9445588398358951E-06, 1.9505656879624058E-06, 1.9505656880227840E-06, -3.9445588398203690E-06, 2.8837147790369691E-06, -1.0444417486660073E-06}; - constexpr FLT c9[] = {-1.9601350641688945E-07, 6.2981383505868899E-07, -1.3252363384761618E-06, 1.9071649677058813E-06, -1.9071649677363285E-06, 1.3252363385149127E-06, -6.2981383505419114E-07, 1.9601350641697053E-07}; + } else if constexpr (w==8) { + constexpr FLT c0[] = {7.5442178667263981E-05, 1.7659090182402842E-02, 2.6112828482312633E-01, 8.6561421087578250E-01, 8.6561421087578250E-01, 2.6112828482312633E-01, 1.7659090182402845E-02, 7.5442178667263791E-05}; + constexpr FLT c1[] = {3.1361556564941506E-04, 3.2518751351035623E-02, 2.4295266212395927E-01, 2.5083142126627173E-01, -2.5083142126627195E-01, -2.4295266212395936E-01, -3.2518751351035616E-02, -3.1361556564941478E-04}; + constexpr FLT c2[] = {5.5627094085228138E-04, 2.4604803324737464E-02, 6.5902977410162808E-02, -9.1064379250067620E-02, -9.1064379250067551E-02, 6.5902977410162836E-02, 2.4604803324737454E-02, 5.5627094085228138E-04}; + constexpr FLT c3[] = {5.5053208919074751E-04, 9.2359485489686994E-03, -6.2169545154250015E-03, -3.1386277864020519E-02, 3.1386277864020574E-02, 6.2169545154250197E-03, -9.2359485489686959E-03, -5.5053208919074741E-04}; + constexpr FLT c4[] = {3.3122072653963842E-04, 1.3353118718124391E-03, -5.9878504390517145E-03, 4.3217905833727848E-03, 4.3217905833727709E-03, -5.9878504390517232E-03, 1.3353118718124370E-03, 3.3122072653963820E-04}; + constexpr FLT c5[] = {1.2112223749399384E-04, -2.3174709024354235E-04, -5.1773322458170408E-04, 1.8691284471381068E-03, -1.8691284471383178E-03, 5.1773322458165399E-04, 2.3174709024354165E-04, -1.2112223749399399E-04}; + constexpr FLT c6[] = {2.3288943339077732E-05, -1.1810885265513068E-04, 2.1380000655375780E-04, -1.1905274322674535E-04, -1.1905274322671518E-04, 2.1380000655376477E-04, -1.1810885265513266E-04, 2.3288943339077745E-05}; + constexpr FLT c7[] = {8.7290223704851186E-08, -9.9551635569409828E-06, 3.9042123573726220E-05, -7.0647330846703837E-05, 7.0647330846726171E-05, -3.9042123573714680E-05, 9.9551635569465072E-06, -8.7290223704776620E-08}; + constexpr FLT c8[] = {-1.0444417486659137E-06, 2.8837147790388339E-06, -3.9445588398419149E-06, 1.9505656879770040E-06, 1.9505656878898172E-06, -3.9445588398656149E-06, 2.8837147790302746E-06, -1.0444417486661539E-06}; + constexpr FLT c9[] = {-1.9601350641696516E-07, 6.2981383505557520E-07, -1.3252363385038412E-06, 1.9071649677086494E-06, -1.9071649677307929E-06, 1.3252363384816978E-06, -6.2981383505661323E-07, 1.9601350641687865E-07}; for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==9) { - constexpr FLT c0[] = {1.3445576990655693E-05, 5.1377966678943553E-03, 1.1569392196071671E-01, 5.9595989228910695E-01, 1.0000000000000004E+00, 5.9595989228910784E-01, 1.1569392196071673E-01, 5.1377966678943874E-03, 1.3445576990655681E-05}; - constexpr FLT c1[] = {6.0003223623206657E-05, 1.0569385595664990E-02, 1.3202059711663530E-01, 3.1241329121161582E-01, -8.4851685343650422E-17, -3.1241329121161615E-01, -1.3202059711663522E-01, -1.0569385595665032E-02, -6.0003223623206596E-05}; - constexpr FLT c2[] = {1.1601811379064824E-04, 9.2861699099147151E-03, 5.4760895870332324E-02, -2.7420112488894219E-04, -1.2777777777777805E-01, -2.7420112488935430E-04, 5.4760895870332296E-02, 9.2861699099147359E-03, 1.1601811379064817E-04}; - constexpr FLT c3[] = {1.2783089927061688E-04, 4.4048543606096807E-03, 6.4505427512762566E-03, -2.6627297241817574E-02, 1.0570032264240285E-16, 2.6627297241817935E-02, -6.4505427512762245E-03, -4.4048543606096877E-03, -1.2783089927061688E-04}; - constexpr FLT c4[] = {8.8459828362140127E-05, 1.1147546008569559E-03, -2.1200589329645782E-03, -2.9677441441083273E-03, 7.7692043895744413E-03, -2.9677441441080211E-03, -2.1200589329645678E-03, 1.1147546008569583E-03, 8.8459828362140168E-05}; - constexpr FLT c5[] = {3.9567294647305465E-05, 8.1817980646548672E-05, -7.2116754318327786E-04, 1.0390038161997466E-03, 1.3960675422467541E-16, -1.0390038161998867E-03, 7.2116754318328556E-04, -8.1817980646550122E-05, -3.9567294647305431E-05}; - constexpr FLT c6[] = {1.1032857092605887E-05, -3.4254477931955853E-05, -1.3557143976035256E-05, 1.8667778536557664E-04, -2.9974999576614188E-04, 1.8667778536546106E-04, -1.3557143976042615E-05, -3.4254477931959885E-05, 1.1032857092605841E-05}; - constexpr FLT c7[] = {1.5345430093717796E-06, -9.9308189188274098E-06, 2.3762810604639151E-05, -2.4017602201954516E-05, 1.1627785359675844E-17, 2.4017602202115669E-05, -2.3762810604628780E-05, 9.9308189188319669E-06, -1.5345430093718216E-06}; - constexpr FLT c8[] = {-8.1737159283255726E-08, -4.1540916378247392E-07, 2.6668107554223020E-06, -6.3261434127908313E-06, 8.2578681449311880E-06, -6.3261434126076934E-06, 2.6668107554440373E-06, -4.1540916378676467E-07, -8.1737159283249333E-08}; - constexpr FLT c9[] = {-7.3256982980608342E-08, 2.3321978963880019E-07, -4.0030411105333760E-07, 3.4388260968054864E-07, 6.5677795522570459E-17, -3.4388260990751890E-07, 4.0030411105333760E-07, -2.3321978963499429E-07, 7.3256982980640781E-08}; - constexpr FLT c10[] = {-1.0121400696579195E-08, 3.6191328862414928E-08, -8.7258577118961372E-08, 1.4622014477867198E-07, -1.7333902174790525E-07, 1.4622014483401952E-07, -8.7258577100106683E-08, 3.6191328859901120E-08, -1.0121400696606260E-08}; + } else if constexpr (w==9) { + constexpr FLT c0[] = {1.3445576990655699E-05, 5.1377966678943518E-03, 1.1569392196071665E-01, 5.9595989228910662E-01, 1.0000000000000004E+00, 5.9595989228910773E-01, 1.1569392196071666E-01, 5.1377966678943813E-03, 1.3445576990655628E-05}; + constexpr FLT c1[] = {6.0003223623206576E-05, 1.0569385595664981E-02, 1.3202059711663514E-01, 3.1241329121161548E-01, -8.4851685343650311E-17, -3.1241329121161582E-01, -1.3202059711663514E-01, -1.0569385595665021E-02, -6.0003223623206590E-05}; + constexpr FLT c2[] = {1.1601811379064814E-04, 9.2861699099147117E-03, 5.4760895870332289E-02, -2.7420112488906883E-04, -1.2777777777777805E-01, -2.7420112488927613E-04, 5.4760895870332275E-02, 9.2861699099147394E-03, 1.1601811379064824E-04}; + constexpr FLT c3[] = {1.2783089927061688E-04, 4.4048543606096807E-03, 6.4505427512762531E-03, -2.6627297241817630E-02, -3.2098550589665862E-17, 2.6627297241817914E-02, -6.4505427512762349E-03, -4.4048543606096903E-03, -1.2783089927061688E-04}; + constexpr FLT c4[] = {8.8459828362140113E-05, 1.1147546008569548E-03, -2.1200589329645964E-03, -2.9677441441082913E-03, 7.7692043895745072E-03, -2.9677441441080905E-03, -2.1200589329645829E-03, 1.1147546008569568E-03, 8.8459828362140100E-05}; + constexpr FLT c5[] = {3.9567294647305411E-05, 8.1817980646544931E-05, -7.2116754318333879E-04, 1.0390038161995874E-03, -1.0283042279864902E-16, -1.0390038161999537E-03, 7.2116754318329336E-04, -8.1817980646548062E-05, -3.9567294647305451E-05}; + constexpr FLT c6[] = {1.1032857092605819E-05, -3.4254477931957174E-05, -1.3557143976059904E-05, 1.8667778536558257E-04, -2.9974999576623664E-04, 1.8667778536542303E-04, -1.3557143976058785E-05, -3.4254477931959797E-05, 1.1032857092605870E-05}; + constexpr FLT c7[] = {1.5345430093718074E-06, -9.9308189188252872E-06, 2.3762810604656844E-05, -2.4017602201909826E-05, -5.2670367292513955E-17, 2.4017602202069285E-05, -2.3762810604655319E-05, 9.9308189188297425E-06, -1.5345430093717854E-06}; + constexpr FLT c8[] = {-8.1737159283252536E-08, -4.1540916378119670E-07, 2.6668107554329568E-06, -6.3261434127473353E-06, 8.2578681448991719E-06, -6.3261434126871112E-06, 2.6668107554144911E-06, -4.1540916378835333E-07, -8.1737159283297297E-08}; + constexpr FLT c9[] = {-7.3256982980638094E-08, 2.3321978963637828E-07, -4.0030411107409711E-07, 3.4388260963072595E-07, -5.0485268896210294E-17, -3.4388260995180581E-07, 4.0030411105610554E-07, -2.3321978963516732E-07, 7.3256982980616468E-08}; + constexpr FLT c10[] = {-1.0121400696640600E-08, 3.6191328860529519E-08, -8.7258577122732534E-08, 1.4622014475704781E-07, -1.7333902188617528E-07, 1.4622014472849038E-07, -8.7258577131531566E-08, 3.6191328857229922E-08, -1.0121400696612377E-08}; for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); - } else if (w==10) { - constexpr FLT c0[] = {2.3186292807626266E-06, 1.3952040327729876E-03, 4.5894237568906843E-02, 3.4666431215091636E-01, 8.9110862394332080E-01, 8.9110862394332024E-01, 3.4666431215091614E-01, 4.5894237568906843E-02, 1.3952040327729804E-03, 2.3186292807626329E-06}; - constexpr FLT c1[] = {1.1010978063160391E-05, 3.1454190365986022E-03, 6.0943215953720313E-02, 2.5074802988370321E-01, 2.0598750885032702E-01, -2.0598750885032710E-01, -2.5074802988370315E-01, -6.0943215953720306E-02, -3.1454190365985909E-03, -1.1010978063160380E-05}; - constexpr FLT c2[] = {2.2925449299630732E-05, 3.1050615653861980E-03, 3.2337657329423494E-02, 4.4760550762170469E-02, -8.0226193254406428E-02, -8.0226193254406289E-02, 4.4760550762170441E-02, 3.2337657329423480E-02, 3.1050615653861868E-03, 2.2925449299630681E-05}; - constexpr FLT c3[] = {2.7622345748507540E-05, 1.7317590416004974E-03, 7.6620063086756569E-03, -9.8393115612840278E-03, -2.1163068654269049E-02, 2.1163068654269510E-02, 9.8393115612841128E-03, -7.6620063086756491E-03, -1.7317590416004913E-03, -2.7622345748507479E-05}; - constexpr FLT c4[] = {2.1363614860997117E-05, 5.7553475552091617E-04, 1.4813144535930287E-04, -4.1113061120761924E-03, 3.3662735809591683E-03, 3.3662735809590794E-03, -4.1113061120762826E-03, 1.4813144535930759E-04, 5.7553475552091368E-04, 2.1363614860997080E-05}; - constexpr FLT c5[] = {1.1063475580065299E-05, 1.0180053030149723E-04, -3.4137441280837177E-04, -4.9828659222651745E-05, 1.0442648308817235E-03, -1.0442648308817467E-03, 4.9828659222713965E-05, 3.4137441280837177E-04, -1.0180053030149541E-04, -1.1063475580065281E-05}; - constexpr FLT c6[] = {3.8359011440648869E-06, 1.3049698816919587E-06, -6.3791463619208982E-05, 1.4528730872072194E-04, -8.6630472952355992E-05, -8.6630472952398913E-05, 1.4528730872073633E-04, -6.3791463619214471E-05, 1.3049698816901833E-06, 3.8359011440648767E-06}; - constexpr FLT c7[] = {8.3366418668164326E-07, -3.5785601754616355E-06, 2.4539930904858821E-06, 1.2754336575782058E-05, -3.3000414536039571E-05, 3.3000414536273711E-05, -1.2754336575693992E-05, -2.4539930904800897E-06, 3.5785601754627781E-06, -8.3366418668163871E-07}; - constexpr FLT c8[] = {8.0572098823818712E-08, -6.8352224328357488E-07, 2.0695541423376112E-06, -2.9709579576770532E-06, 1.5005770225996294E-06, 1.5005770226481292E-06, -2.9709579578116679E-06, 2.0695541423438809E-06, -6.8352224328404986E-07, 8.0572098823810798E-08}; - constexpr FLT c9[] = {-1.0412910456843575E-08, -3.6228831474008107E-09, 1.3932530225640674E-07, -4.5071262434444286E-07, 7.5149884418348562E-07, -7.5149884428313110E-07, 4.5071262441364111E-07, -1.3932530225017888E-07, 3.6228831478332996E-09, 1.0412910456861821E-08}; - constexpr FLT c10[] = {-4.4291858216944146E-09, 1.5904364893350153E-08, -3.2603275106346107E-08, 3.8190045632066571E-08, -1.7631718176528265E-08, -1.7631718292171639E-08, 3.8190045621381707E-08, -3.2603275098803994E-08, 1.5904364893978648E-08, -4.4291858217073890E-09}; - constexpr FLT c11[] = {-4.4040059170580565E-10, 1.7857872825180656E-09, -4.9203237617335969E-09, 9.5125262125165431E-09, -1.3157194779492521E-08, 1.3157194812996001E-08, -9.5125262191888681E-09, 4.9203237596041585E-09, -1.7857872834763311E-09, 4.4040059170802652E-10}; + } else if constexpr (w==10) { + constexpr FLT c0[] = {2.3186292807626317E-06, 1.3952040327729876E-03, 4.5894237568906815E-02, 3.4666431215091620E-01, 8.9110862394331991E-01, 8.9110862394331991E-01, 3.4666431215091603E-01, 4.5894237568906829E-02, 1.3952040327729781E-03, 2.3186292807626118E-06}; + constexpr FLT c1[] = {1.1010978063160384E-05, 3.1454190365985996E-03, 6.0943215953720244E-02, 2.5074802988370293E-01, 2.0598750885032677E-01, -2.0598750885032696E-01, -2.5074802988370298E-01, -6.0943215953720258E-02, -3.1454190365985870E-03, -1.1010978063160370E-05}; + constexpr FLT c2[] = {2.2925449299630712E-05, 3.1050615653861963E-03, 3.2337657329423480E-02, 4.4760550762170462E-02, -8.0226193254406539E-02, -8.0226193254406331E-02, 4.4760550762170441E-02, 3.2337657329423480E-02, 3.1050615653861863E-03, 2.2925449299630688E-05}; + constexpr FLT c3[] = {2.7622345748507543E-05, 1.7317590416004976E-03, 7.6620063086756561E-03, -9.8393115612840799E-03, -2.1163068654269191E-02, 2.1163068654269430E-02, 9.8393115612840625E-03, -7.6620063086756578E-03, -1.7317590416004915E-03, -2.7622345748507465E-05}; + constexpr FLT c4[] = {2.1363614860997120E-05, 5.7553475552091660E-04, 1.4813144535930209E-04, -4.1113061120762115E-03, 3.3662735809589788E-03, 3.3662735809589744E-03, -4.1113061120763477E-03, 1.4813144535929786E-04, 5.7553475552091303E-04, 2.1363614860997073E-05}; + constexpr FLT c5[] = {1.1063475580065288E-05, 1.0180053030149580E-04, -3.4137441280839747E-04, -4.9828659222695235E-05, 1.0442648308815685E-03, -1.0442648308818987E-03, 4.9828659222757584E-05, 3.4137441280838202E-04, -1.0180053030149428E-04, -1.1063475580065274E-05}; + constexpr FLT c6[] = {3.8359011440648767E-06, 1.3049698816909941E-06, -6.3791463619226925E-05, 1.4528730872072232E-04, -8.6630472952467041E-05, -8.6630472952443704E-05, 1.4528730872072487E-04, -6.3791463619226898E-05, 1.3049698816893206E-06, 3.8359011440648818E-06}; + constexpr FLT c7[] = {8.3366418668164029E-07, -3.5785601754608600E-06, 2.4539930904838721E-06, 1.2754336575774621E-05, -3.3000414536145769E-05, 3.3000414536255456E-05, -1.2754336575763307E-05, -2.4539930904908122E-06, 3.5785601754624753E-06, -8.3366418668162441E-07}; + constexpr FLT c8[] = {8.0572098823818831E-08, -6.8352224328304655E-07, 2.0695541423448143E-06, -2.9709579576719820E-06, 1.5005770225068746E-06, 1.5005770225918255E-06, -2.9709579578749849E-06, 2.0695541423337669E-06, -6.8352224328480012E-07, 8.0572098823799839E-08}; + constexpr FLT c9[] = {-1.0412910456853713E-08, -3.6228831479630466E-09, 1.3932530224464304E-07, -4.5071262431676362E-07, 7.5149884418071783E-07, -7.5149884434956146E-07, 4.5071262441364116E-07, -1.3932530225433080E-07, 3.6228831480495445E-09, 1.0412910456864524E-08}; + constexpr FLT c10[] = {-4.4291858217039330E-09, 1.5904364892603811E-08, -3.2603275117030568E-08, 3.8190045595613436E-08, -1.7631718285886992E-08, -1.7631718414100877E-08, 3.8190045567330887E-08, -3.2603275115773551E-08, 1.5904364892603811E-08, -4.4291858217086909E-09}; + constexpr FLT c11[] = {-4.4040059170691262E-10, 1.7857872831924088E-09, -4.9203237607399544E-09, 9.5125262237319707E-09, -1.3157194794825051E-08, 1.3157194847067723E-08, -9.5125262416193676E-09, 4.9203237493828085E-09, -1.7857872837247548E-09, 4.4040059171467111E-10}; for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); - } else if (w==11) { - constexpr FLT c0[] = {3.8884809238313434E-07, 3.5785567372179951E-04, 1.6654951019551330E-02, 1.7692785324424570E-01, 6.5593328211813162E-01, 9.9999999999999978E-01, 6.5593328211813129E-01, 1.7692785324424565E-01, 1.6654951019551330E-02, 3.5785567372179962E-04, 3.8884809238312539E-07}; - constexpr FLT c1[] = {1.9516358260453364E-06, 8.7214421096705593E-04, 2.4929466432368100E-02, 1.5885079249667189E-01, 2.7894884556454935E-01, 9.4204294746769595E-33, -2.7894884556454941E-01, -1.5885079249667189E-01, -2.4929466432368097E-02, -8.7214421096705604E-04, -1.9516358260453169E-06}; - constexpr FLT c2[] = {4.3353827605930511E-06, 9.4705645354715550E-04, 1.5700144896729017E-02, 4.8428271550326758E-02, -1.2807080799297165E-02, -1.0454545454545448E-01, -1.2807080799297061E-02, 4.8428271550326821E-02, 1.5700144896729006E-02, 9.4705645354715518E-04, 4.3353827605930215E-06}; - constexpr FLT c3[] = {5.6395387871289846E-06, 5.9760549110825473E-04, 5.0911332059142295E-03, 1.6690038662948304E-03, -2.1030028251697912E-02, 1.4335617874817167E-16, 2.1030028251698141E-02, -1.6690038662947660E-03, -5.0911332059142200E-03, -5.9760549110825429E-04, -5.6395387871289508E-06}; - constexpr FLT c4[] = {4.7836299264887200E-06, 2.3732554180006408E-04, 7.1846854433598795E-04, -2.2660086673713248E-03, -1.3190061226035158E-03, 5.2488730277989188E-03, -1.3190061226033569E-03, -2.2660086673713374E-03, 7.1846854433598557E-04, 2.3732554180006421E-04, 4.7836299264886963E-06}; - constexpr FLT c5[] = {2.7801202330030064E-06, 5.8401836435976300E-05, -5.7255962675850168E-05, -4.1058481683291448E-04, 7.4543249761827859E-04, 6.7099534430837577E-17, -7.4543249761823186E-04, 4.1058481683291448E-04, 5.7255962675853089E-05, -5.8401836435976178E-05, -2.7801202330029924E-06}; - constexpr FLT c6[] = {1.1248609988572041E-06, 7.1593996360419040E-06, -3.7923443960739119E-05, 2.8219312687371359E-05, 8.5797383067823588E-05, -1.6875309167105302E-04, 8.5797383067779691E-05, 2.8219312687392853E-05, -3.7923443960740034E-05, 7.1593996360418057E-06, 1.1248609988571978E-06}; - constexpr FLT c7[] = {3.1074712008817516E-07, -3.7942806006679305E-07, -4.2327710785708026E-06, 1.4518421536643064E-05, -1.6373413879605298E-05, 3.0222646636983358E-17, 1.6373413879621934E-05, -1.4518421536591986E-05, 4.2327710785753580E-06, 3.7942806006705484E-07, -3.1074712008817235E-07}; - constexpr FLT c8[] = {5.3160526822194444E-08, -2.9438470061321741E-07, 4.4816653817789122E-07, 4.9835853873945607E-07, -2.6602444110833864E-06, 3.9090815375281113E-06, -2.6602444110225165E-06, 4.9835853874269618E-07, 4.4816653818193273E-07, -2.9438470061323123E-07, 5.3160526822193583E-08}; - constexpr FLT c9[] = {3.1778958300854393E-09, -3.9044067083483707E-08, 1.4726158788365547E-07, -2.7451209287062293E-07, 2.4544112217999958E-07, 8.6199548859978872E-18, -2.4544112207758621E-07, 2.7451209285678326E-07, -1.4726158788296347E-07, 3.9044067083624268E-08, -3.1778958300829052E-09}; - constexpr FLT c10[] = {-8.6163117991617490E-10, 1.2292710054271969E-09, 4.9928263052430922E-09, -2.5746199362556884E-08, 5.5054682151312924E-08, -6.9606951358406722E-08, 5.5054682230504105E-08, -2.5746199365699604E-08, 4.9928263093284604E-09, 1.2292710054468060E-09, -8.6163117991862728E-10}; - constexpr FLT c11[] = {-2.3293080872726303E-10, 9.3461130390718653E-10, -2.2220140857286656E-09, 3.2420144232604506E-09, -2.5573586459741160E-09, -3.4362247560151687E-17, 2.5573586170134590E-09, -3.2420144222311963E-09, 2.2220140843090244E-09, -9.3461130382733279E-10, 2.3293080872885788E-10}; - constexpr FLT c12[] = {-1.6776727231079557E-11, 7.5440974150049303E-11, -2.3911386677196792E-10, 5.3207180787495740E-10, -8.5057641018270776E-10, 9.9272876082686339E-10, -8.5057644693357476E-10, 5.3207181195839291E-10, -2.3911386485786361E-10, 7.5440974126123504E-11, -1.6776727231328710E-11}; + } else if constexpr (w==11) { + constexpr FLT c0[] = {3.8884809238313450E-07, 3.5785567372179962E-04, 1.6654951019551327E-02, 1.7692785324424565E-01, 6.5593328211813118E-01, 9.9999999999999967E-01, 6.5593328211813084E-01, 1.7692785324424556E-01, 1.6654951019551324E-02, 3.5785567372179918E-04, 3.8884809238312270E-07}; + constexpr FLT c1[] = {1.9516358260453343E-06, 8.7214421096705550E-04, 2.4929466432368069E-02, 1.5885079249667169E-01, 2.7894884556454913E-01, -8.4851685343650385E-17, -2.7894884556454930E-01, -1.5885079249667178E-01, -2.4929466432368076E-02, -8.7214421096705539E-04, -1.9516358260453157E-06}; + constexpr FLT c2[] = {4.3353827605930486E-06, 9.4705645354715496E-04, 1.5700144896729010E-02, 4.8428271550326765E-02, -1.2807080799297233E-02, -1.0454545454545465E-01, -1.2807080799297139E-02, 4.8428271550326821E-02, 1.5700144896729003E-02, 9.4705645354715550E-04, 4.3353827605930232E-06}; + constexpr FLT c3[] = {5.6395387871289838E-06, 5.9760549110825483E-04, 5.0911332059142269E-03, 1.6690038662948178E-03, -2.1030028251698023E-02, 4.0648251383670637E-17, 2.1030028251698085E-02, -1.6690038662948057E-03, -5.0911332059142252E-03, -5.9760549110825462E-04, -5.6395387871289525E-06}; + constexpr FLT c4[] = {4.7836299264887174E-06, 2.3732554180006424E-04, 7.1846854433598318E-04, -2.2660086673713543E-03, -1.3190061226035939E-03, 5.2488730277987635E-03, -1.3190061226035364E-03, -2.2660086673713794E-03, 7.1846854433598557E-04, 2.3732554180006394E-04, 4.7836299264886929E-06}; + constexpr FLT c5[] = {2.7801202330030043E-06, 5.8401836435975921E-05, -5.7255962675864676E-05, -4.1058481683298675E-04, 7.4543249761816952E-04, -1.1824261682570684E-17, -7.4543249761830374E-04, 4.1058481683291709E-04, 5.7255962675855183E-05, -5.8401836435976171E-05, -2.7801202330029916E-06}; + constexpr FLT c6[] = {1.1248609988572028E-06, 7.1593996360417151E-06, -3.7923443960744513E-05, 2.8219312687366578E-05, 8.5797383067800522E-05, -1.6875309167112642E-04, 8.5797383067676258E-05, 2.8219312687366561E-05, -3.7923443960745272E-05, 7.1593996360417574E-06, 1.1248609988571984E-06}; + constexpr FLT c7[] = {3.1074712008817415E-07, -3.7942806006667198E-07, -4.2327710785702258E-06, 1.4518421536658761E-05, -1.6373413879628303E-05, -3.1234901536718130E-17, 1.6373413879560473E-05, -1.4518421536624180E-05, 4.2327710785729931E-06, 3.7942806006691651E-07, -3.1074712008817188E-07}; + constexpr FLT c8[] = {5.3160526822192677E-08, -2.9438470061316177E-07, 4.4816653818369536E-07, 4.9835853876343515E-07, -2.6602444110742270E-06, 3.9090815374501741E-06, -2.6602444111213995E-06, 4.9835853870698559E-07, 4.4816653817806582E-07, -2.9438470061343044E-07, 5.3160526822190930E-08}; + constexpr FLT c9[] = {3.1778958300839193E-09, -3.9044067083716177E-08, 1.4726158787950358E-07, -2.7451209289276638E-07, 2.4544112216615996E-07, 8.3873022606577459E-18, -2.4544112211633722E-07, 2.7451209284571158E-07, -1.4726158788400146E-07, 3.9044067083597242E-08, -3.1778958300827369E-09}; + constexpr FLT c10[] = {-8.6163117991801527E-10, 1.2292710052700590E-09, 4.9928263017863946E-09, -2.5746199379526484E-08, 5.5054682142514025E-08, -6.9606951492905721E-08, 5.5054682071650699E-08, -2.5746199414722506E-08, 4.9928263008436627E-09, 1.2292710051718510E-09, -8.6163117991870545E-10}; + constexpr FLT c11[] = {-2.3293080872740411E-10, 9.3461130398706385E-10, -2.2220140843091046E-09, 3.2420144489205448E-09, -2.5573586346170970E-09, -3.1552257230018361E-17, 2.5573585789673351E-09, -3.2420144222313730E-09, 2.2220140838832191E-09, -9.3461130385397131E-10, 2.3293080872969065E-10}; + constexpr FLT c12[] = {-1.6776727230706402E-11, 7.5440974197914497E-11, -2.3911386243337562E-10, 5.3207181502105922E-10, -8.5057637343194789E-10, 9.9272871182573856E-10, -8.5057647960094213E-10, 5.3207179358287845E-10, -2.3911386766517342E-10, 7.5440974038380308E-11, -1.6776727232199783E-11}; for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); - } else if (w==12) { - constexpr FLT c0[] = {6.3667715563015689E-08, 8.7461142088576888E-05, 5.6146669497086589E-03, 8.1271316412301370E-02, 4.1627261402765736E-01, 9.0846375182673755E-01, 9.0846375182673755E-01, 4.1627261402765736E-01, 8.1271316412301550E-02, 5.6146669497086719E-03, 8.7461142088576929E-05, 6.3667715563034801E-08}; - constexpr FLT c1[] = {3.3587389488258588E-07, 2.2809471090022899E-04, 9.2744480587562007E-03, 8.5676487647659991E-02, 2.4720659158040625E-01, 1.7472997738462001E-01, -1.7472997738461990E-01, -2.4720659158040617E-01, -8.5676487647660143E-02, -9.2744480587562180E-03, -2.2809471090022899E-04, -3.3587389488256608E-07}; - constexpr FLT c2[] = {7.9035220764954472E-07, 2.6846594761214740E-04, 6.6557324960729147E-03, 3.4792641812076718E-02, 2.9454899103693762E-02, -7.1172529707069221E-02, -7.1172529707069207E-02, 2.9454899103693671E-02, 3.4792641812076690E-02, 6.6557324960729242E-03, 2.6846594761214740E-04, 7.9035220764956886E-07}; - constexpr FLT c3[] = {1.0993606197695965E-06, 1.8716155179384050E-04, 2.6329045000561364E-03, 5.3754303637600113E-03, -1.0591878410592502E-02, -1.5228395084945664E-02, 1.5228395084945803E-02, 1.0591878410592646E-02, -5.3754303637599376E-03, -2.6329045000561364E-03, -1.8716155179384044E-04, -1.0993606197695836E-06}; - constexpr FLT c4[] = {1.0091198513153346E-06, 8.4812954286468477E-05, 5.7431140218944460E-04, -5.0274672420766203E-04, -2.8008958990917627E-03, 2.6435090762445433E-03, 2.6435090762445819E-03, -2.8008958990918187E-03, -5.0274672420767580E-04, 5.7431140218944276E-04, 8.4812954286468423E-05, 1.0091198513153598E-06}; - constexpr FLT c5[] = {6.4507244019416584E-07, 2.5481132674301279E-05, 4.2795619387511420E-05, -3.0197159708156643E-04, 1.1080610219049720E-04, 6.4144454802694492E-04, -6.4144454802681275E-04, -1.1080610219045053E-04, 3.0197159708157808E-04, -4.2795619387511908E-05, -2.5481132674301286E-05, -6.4507244019414964E-07}; - constexpr FLT c6[] = {2.9426545129495891E-07, 4.7724106401925034E-06, -1.1001642128368358E-05, -2.6869692251292103E-05, 9.4483235217708846E-05, -6.1678458203322752E-05, -6.1678458203283029E-05, 9.4483235217638725E-05, -2.6869692251319154E-05, -1.1001642128368348E-05, 4.7724106401924525E-06, 2.9426545129497845E-07}; - constexpr FLT c7[] = {9.5799843879057487E-08, 3.7784160107136394E-07, -3.2256313018476217E-06, 5.0144058082843800E-06, 3.4886031174309006E-06, -1.7411974954245794E-05, 1.7411974954244114E-05, -3.4886031173677615E-06, -5.0144058082412084E-06, 3.2256313018490718E-06, -3.7784160107127161E-07, -9.5799843879039593E-08}; - constexpr FLT c8[] = {2.1473864761677802E-08, -5.7414008446850441E-08, -2.0134799316446491E-07, 1.1145247706131597E-06, -1.8840465966107854E-06, 1.0067804561094662E-06, 1.0067804560969447E-06, -1.8840465965985945E-06, 1.1145247706194121E-06, -2.0134799316567892E-07, -5.7414008446903526E-08, 2.1473864761695718E-08}; - constexpr FLT c9[] = {2.8867786924320735E-09, -2.0015791402048098E-08, 4.5306507660172584E-08, -7.8859059608423767E-09, -1.5755151471717741E-07, 3.4270221893522085E-07, -3.4270221891584534E-07, 1.5755151474485673E-07, 7.8859059608423767E-09, -4.5306507656885666E-08, 2.0015791402102159E-08, -2.8867786924173336E-09}; - constexpr FLT c10[] = {6.9986758892026879E-11, -1.8486004428526375E-09, 8.7658205612213605E-09, -2.0364661368255434E-08, 2.5396405431717686E-08, -1.2044441164754235E-08, -1.2044441145898965E-08, 2.5396405393379069E-08, -2.0364661337458944E-08, 8.7658205594930229E-09, -1.8486004428624741E-09, 6.9986758906941889E-11}; - constexpr FLT c11[] = {-5.6296594747629561E-11, 1.4066781276164117E-10, 4.6947620156299098E-11, -1.1526063766721083E-09, 3.3027593515457814E-09, -5.2174001597719162E-09, 5.2174001336505757E-09, -3.3027593563725673E-09, 1.1526063504088099E-09, -4.6947618665684182E-11, -1.4066781273945818E-10, 5.6296594761077256E-11}; - constexpr FLT c12[] = {-1.0870401168253040E-11, 4.8044744351982426E-11, -1.3004175788815863E-10, 2.2570502267192305E-10, -2.4006684875388499E-10, 1.0598000131166063E-10, 1.0597991964307358E-10, -2.4006682833673746E-10, 2.2570504206821193E-10, -1.3004176149306233E-10, 4.8044744304130286E-11, -1.0870401156071839E-11}; - constexpr FLT c13[] = {-4.7539080498592749E-13, 2.6787995976616703E-12, -1.0000145739993567E-11, 2.5777400861531429E-11, -4.7463672955972831E-11, 6.4012227921839136E-11, -6.4012266007267373E-11, 4.7463669782187146E-11, -2.5777397687745743E-11, 1.0000149112140858E-11, -2.6787995744161696E-12, 4.7539081133001201E-13}; + } else if constexpr (w==12) { + constexpr FLT c0[] = {6.3667715563015875E-08, 8.7461142088576875E-05, 5.6146669497086563E-03, 8.1271316412301356E-02, 4.1627261402765720E-01, 9.0846375182673667E-01, 9.0846375182673689E-01, 4.1627261402765714E-01, 8.1271316412301522E-02, 5.6146669497086675E-03, 8.7461142088576766E-05, 6.3667715563034443E-08}; + constexpr FLT c1[] = {3.3587389488258561E-07, 2.2809471090022882E-04, 9.2744480587561903E-03, 8.5676487647659963E-02, 2.4720659158040598E-01, 1.7472997738461984E-01, -1.7472997738461982E-01, -2.4720659158040606E-01, -8.5676487647660074E-02, -9.2744480587562093E-03, -2.2809471090022882E-04, -3.3587389488256608E-07}; + constexpr FLT c2[] = {7.9035220764954472E-07, 2.6846594761214729E-04, 6.6557324960729129E-03, 3.4792641812076697E-02, 2.9454899103693716E-02, -7.1172529707069263E-02, -7.1172529707069318E-02, 2.9454899103693668E-02, 3.4792641812076690E-02, 6.6557324960729259E-03, 2.6846594761214735E-04, 7.9035220764956928E-07}; + constexpr FLT c3[] = {1.0993606197695963E-06, 1.8716155179384050E-04, 2.6329045000561373E-03, 5.3754303637599983E-03, -1.0591878410592564E-02, -1.5228395084945693E-02, 1.5228395084945721E-02, 1.0591878410592634E-02, -5.3754303637599515E-03, -2.6329045000561373E-03, -1.8716155179384044E-04, -1.0993606197695834E-06}; + constexpr FLT c4[] = {1.0091198513153346E-06, 8.4812954286468423E-05, 5.7431140218944417E-04, -5.0274672420765716E-04, -2.8008958990918590E-03, 2.6435090762444630E-03, 2.6435090762444709E-03, -2.8008958990918750E-03, -5.0274672420768665E-04, 5.7431140218944243E-04, 8.4812954286468410E-05, 1.0091198513153596E-06}; + constexpr FLT c5[] = {6.4507244019416542E-07, 2.5481132674301171E-05, 4.2795619387508642E-05, -3.0197159708156746E-04, 1.1080610219034021E-04, 6.4144454802687542E-04, -6.4144454802693245E-04, -1.1080610219044532E-04, 3.0197159708159055E-04, -4.2795619387510356E-05, -2.5481132674301232E-05, -6.4507244019414964E-07}; + constexpr FLT c6[] = {2.9426545129495870E-07, 4.7724106401924017E-06, -1.1001642128368620E-05, -2.6869692251298703E-05, 9.4483235217664095E-05, -6.1678458203335938E-05, -6.1678458203393943E-05, 9.4483235217639687E-05, -2.6869692251325618E-05, -1.1001642128368075E-05, 4.7724106401924466E-06, 2.9426545129497887E-07}; + constexpr FLT c7[] = {9.5799843879057143E-08, 3.7784160107128570E-07, -3.2256313018464523E-06, 5.0144058082807615E-06, 3.4886031174315520E-06, -1.7411974954248305E-05, 1.7411974954164805E-05, -3.4886031173927896E-06, -5.0144058082569217E-06, 3.2256313018479033E-06, -3.7784160107127585E-07, -9.5799843879039315E-08}; + constexpr FLT c8[] = {2.1473864761677977E-08, -5.7414008446866310E-08, -2.0134799316349550E-07, 1.1145247706237484E-06, -1.8840465965944781E-06, 1.0067804561525776E-06, 1.0067804560939612E-06, -1.8840465966553108E-06, 1.1145247706046697E-06, -2.0134799316649463E-07, -5.7414008446904168E-08, 2.1473864761695606E-08}; + constexpr FLT c9[] = {2.8867786924317360E-09, -2.0015791402122435E-08, 4.5306507658961618E-08, -7.8859059649942724E-09, -1.5755151476976810E-07, 3.4270221890477367E-07, -3.4270221894629257E-07, 1.5755151475039259E-07, 7.8859059622263425E-09, -4.5306507657577652E-08, 2.0015791402096756E-08, -2.8867786924175027E-09}; + constexpr FLT c10[] = {6.9986758891768954E-11, -1.8486004429312075E-09, 8.7658205616141919E-09, -2.0364661377054439E-08, 2.5396405363839655E-08, -1.2044441255258105E-08, -1.2044441287940056E-08, 2.5396405319216143E-08, -2.0364661357570944E-08, 8.7658205583145718E-09, -1.8486004429312099E-09, 6.9986758906969690E-11}; + constexpr FLT c11[] = {-5.6296594747769723E-11, 1.4066781270841427E-10, 4.6947621930829374E-11, -1.1526063712774952E-09, 3.3027593717047561E-09, -5.2174001478471531E-09, 5.2174000780009304E-09, -3.3027593677297393E-09, 1.1526063475695102E-09, -4.6947618807614674E-11, -1.4066781270398007E-10, 5.6296594761321533E-11}; + constexpr FLT c12[] = {-1.0870401167973640E-11, 4.8044744348002116E-11, -1.3004175664400734E-10, 2.2570502777621933E-10, -2.4006681200298005E-10, 1.0598000539497877E-10, 1.0597996456091133E-10, -2.4006685896251565E-10, 2.2570503237007147E-10, -1.3004176149304966E-10, 4.8044744298142519E-11, -1.0870401156024175E-11}; + constexpr FLT c13[] = {-4.7539080513361941E-13, 2.6787995685622948E-12, -1.0000146360885640E-11, 2.5777397611211484E-11, -4.7463678850231190E-11, 6.4012226087607112E-11, -6.4012244598782536E-11, 4.7463686292174809E-11, -2.5777393714330728E-11, 1.0000149173124785E-11, -2.6787995668897570E-12, 4.7539081135519811E-13}; for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); - } else if (w==13) { - constexpr FLT c0[] = {1.0208956054983696E-08, 2.0506572462261995E-05, 1.7784497194617906E-03, 3.4214490279693019E-02, 2.3443634373410047E-01, 7.0049708882252804E-01, 9.9999999999999956E-01, 7.0049708882252670E-01, 2.3443634373410041E-01, 3.4214490279692922E-02, 1.7784497194617906E-03, 2.0506572462261785E-05, 1.0208956054983676E-08}; - constexpr FLT c1[] = {5.6353468219321995E-08, 5.6780128053894686E-05, 3.1934841481628326E-03, 4.0941461360716927E-02, 1.7436810648693357E-01, 2.5085467225681696E-01, -6.3638764007737755E-17, -2.5085467225681662E-01, -1.7436810648693341E-01, -4.0941461360716816E-02, -3.1934841481628326E-03, -5.6780128053894232E-05, -5.6353468219321988E-08}; - constexpr FLT c2[] = {1.3966266158866427E-07, 7.1655019336418755E-05, 2.5459504018621182E-03, 2.0160236969440644E-02, 4.0770064165298429E-02, -1.9317276988534509E-02, -8.8461538461538661E-02, -1.9317276988534381E-02, 4.0770064165298395E-02, 2.0160236969440602E-02, 2.5459504018621160E-03, 7.1655019336418200E-05, 1.3966266158866422E-07}; - constexpr FLT c3[] = {2.0618605552701903E-07, 5.4306747658367697E-05, 1.1637911071900936E-03, 4.7784706844645319E-03, -1.2004184173788884E-03, -1.6862510515565966E-02, 1.4394808111083350E-16, 1.6862510515566146E-02, 1.2004184173788636E-03, -4.7784706844645379E-03, -1.1637911071900920E-03, -5.4306747658367331E-05, -2.0618605552701909E-07}; - constexpr FLT c4[] = {2.0277547837406105E-07, 2.7328509487415503E-05, 3.2236608098850310E-04, 3.0859705461356495E-04, -2.0254394973524947E-03, -5.2398574644553877E-04, 3.7818616294949463E-03, -5.2398574644547762E-04, -2.0254394973524895E-03, 3.0859705461357378E-04, 3.2236608098850327E-04, 2.7328509487415384E-05, 2.0277547837406108E-07}; - constexpr FLT c5[] = {1.4058372037094490E-07, 9.4685595066536085E-06, 4.8682874512158502E-05, -1.1575111217134651E-04, -2.1811605515759046E-04, 5.4056763477041119E-04, 1.1213866287069097E-16, -5.4056763477029453E-04, 2.1811605515769156E-04, 1.1575111217135234E-04, -4.8682874512158861E-05, -9.4685595066535949E-06, -1.4058372037094498E-07}; - constexpr FLT c6[] = {7.0755520230584385E-08, 2.2298625886400277E-06, 7.8375383352022143E-07, -2.8394470622676381E-05, 3.5771256766257562E-05, 4.1631950912211130E-05, -1.0418619302467684E-04, 4.1631950912333557E-05, 3.5771256766183768E-05, -2.8394470622671916E-05, 7.8375383351933331E-07, 2.2298625886400294E-06, 7.0755520230584346E-08}; - constexpr FLT c7[] = {2.6111186487625245E-08, 3.2044561720738826E-07, -1.2220373462313589E-06, -8.5793794342228941E-07, 8.3299507234112700E-06, -1.0956754351178954E-05, 9.4610283796409485E-17, 1.0956754351115859E-05, -8.3299507234215327E-06, 8.5793794342144989E-07, 1.2220373462321896E-06, -3.2044561720741346E-07, -2.6111186487625302E-08}; - constexpr FLT c8[] = {6.9838095920570498E-09, 1.2796250155222958E-08, -2.1971713837900942E-07, 5.2791981730307194E-07, -1.4622692107334488E-07, -1.2222183756556175E-06, 2.0809248310569844E-06, -1.2222183756925741E-06, -1.4622692099063203E-07, 5.2791981730006307E-07, -2.1971713837856465E-07, 1.2796250155283016E-08, 6.9838095920570937E-09}; - constexpr FLT c9[] = {1.2845897306280646E-09, -5.2304801922802769E-09, -5.0548716982175665E-09, 6.7539942924545603E-08, -1.6027276234256162E-07, 1.5655092165632365E-07, 4.6828140259346451E-17, -1.5655092173659360E-07, 1.6027276234809749E-07, -6.7539942912781904E-08, 5.0548716984338105E-09, 5.2304801922379145E-09, -1.2845897306280857E-09}; - constexpr FLT c10[] = {1.3345700642131601E-10, -1.1551704392349950E-09, 3.4412362345673782E-09, -3.2850871078054311E-09, -6.1855158542452699E-09, 2.3119925642302808E-08, -3.2145944181567604E-08, 2.3119926027259106E-08, -6.1855159240088862E-09, -3.2850871247748739E-09, 3.4412362345280933E-09, -1.1551704391858975E-09, 1.3345700642134581E-10}; - constexpr FLT c11[] = {-1.9694481417663767E-12, -7.0630732018717419E-11, 4.4161967766895751E-10, -1.2581280884757252E-09, 2.0087583285653241E-09, -1.6557203488425082E-09, 5.7014219382328511E-17, 1.6557200410648860E-09, -2.0087583339599462E-09, 1.2581281082796833E-09, -4.4161967789965090E-10, 7.0630731978790794E-11, 1.9694481417229703E-12}; - constexpr FLT c12[] = {-3.1122514901291979E-12, 1.0235548893351873E-11, -1.0076717787418374E-11, -3.6278872085836478E-11, 1.6235812713334426E-10, -3.2356766327511469E-10, 4.0014573853281197E-10, -3.2356772044312440E-10, 1.6235817511363862E-10, -3.6278891226911122E-11, -1.0076717627909611E-11, 1.0235548938213992E-11, -3.1122514900941893E-12}; - constexpr FLT c13[] = {-4.4521627553052389E-13, 2.1830423195977186E-12, -6.6494700502871459E-12, 1.3364548102385267E-11, -1.7572530897780217E-11, 1.3087527392509343E-11, -1.4854086432767967E-17, -1.3087613084722882E-11, 1.7572508681280409E-11, -1.3364552466340585E-11, 6.6494701742631489E-12, -2.1830423513665695E-12, 4.4521627553052389E-13}; - constexpr FLT c14[] = {-1.1331825591762625E-14, 7.5442537823437382E-14, -3.5473113067901070E-13, 1.0827924393926043E-12, -2.3053993601726267E-12, 3.5752731472827676E-12, -4.1288118242378826E-12, 3.5755029357484062E-12, -2.3054273074184593E-12, 1.0827837446939142E-12, -3.5473109186339628E-13, 7.5442574213081941E-14, -1.1331825564518091E-14}; + } else if constexpr (w==13) { + constexpr FLT c0[] = {1.0208956054983739E-08, 2.0506572462261991E-05, 1.7784497194617899E-03, 3.4214490279692991E-02, 2.3443634373410030E-01, 7.0049708882252748E-01, 9.9999999999999911E-01, 7.0049708882252648E-01, 2.3443634373410030E-01, 3.4214490279692901E-02, 1.7784497194617893E-03, 2.0506572462261737E-05, 1.0208956054983552E-08}; + constexpr FLT c1[] = {5.6353468219321909E-08, 5.6780128053894652E-05, 3.1934841481628300E-03, 4.0941461360716865E-02, 1.7436810648693329E-01, 2.5085467225681662E-01, -1.9091629202321336E-16, -2.5085467225681657E-01, -1.7436810648693327E-01, -4.0941461360716788E-02, -3.1934841481628291E-03, -5.6780128053894164E-05, -5.6353468219321955E-08}; + constexpr FLT c2[] = {1.3966266158866417E-07, 7.1655019336418701E-05, 2.5459504018621160E-03, 2.0160236969440630E-02, 4.0770064165298388E-02, -1.9317276988534474E-02, -8.8461538461538772E-02, -1.9317276988534395E-02, 4.0770064165298388E-02, 2.0160236969440602E-02, 2.5459504018621160E-03, 7.1655019336418227E-05, 1.3966266158866427E-07}; + constexpr FLT c3[] = {2.0618605552701914E-07, 5.4306747658367697E-05, 1.1637911071900929E-03, 4.7784706844645362E-03, -1.2004184173789185E-03, -1.6862510515566094E-02, 7.6857235047377246E-17, 1.6862510515566118E-02, 1.2004184173788478E-03, -4.7784706844645414E-03, -1.1637911071900923E-03, -5.4306747658367358E-05, -2.0618605552701911E-07}; + constexpr FLT c4[] = {2.0277547837406105E-07, 2.7328509487415500E-05, 3.2236608098850278E-04, 3.0859705461355795E-04, -2.0254394973525615E-03, -5.2398574644568101E-04, 3.7818616294948075E-03, -5.2398574644549009E-04, -2.0254394973524856E-03, 3.0859705461356413E-04, 3.2236608098850262E-04, 2.7328509487415354E-05, 2.0277547837406105E-07}; + constexpr FLT c5[] = {1.4058372037094495E-07, 9.4685595066535949E-06, 4.8682874512157492E-05, -1.1575111217136869E-04, -2.1811605515770685E-04, 5.4056763477026894E-04, -1.3413321890920493E-16, -5.4056763477036002E-04, 2.1811605515769839E-04, 1.1575111217134967E-04, -4.8682874512158421E-05, -9.4685595066535746E-06, -1.4058372037094500E-07}; + constexpr FLT c6[] = {7.0755520230584346E-08, 2.2298625886400052E-06, 7.8375383351888153E-07, -2.8394470622678048E-05, 3.5771256766222007E-05, 4.1631950912162111E-05, -1.0418619302475171E-04, 4.1631950912266106E-05, 3.5771256766155742E-05, -2.8394470622679332E-05, 7.8375383351906321E-07, 2.2298625886400366E-06, 7.0755520230584399E-08}; + constexpr FLT c7[] = {2.6111186487625288E-08, 3.2044561720738900E-07, -1.2220373462315751E-06, -8.5793794341938345E-07, 8.3299507234159490E-06, -1.0956754351193869E-05, -4.2133273481770268E-18, 1.0956754351102354E-05, -8.3299507234444127E-06, 8.5793794341977245E-07, 1.2220373462317335E-06, -3.2044561720742071E-07, -2.6111186487625245E-08}; + constexpr FLT c8[] = {6.9838095920570697E-09, 1.2796250155229052E-08, -2.1971713837869602E-07, 5.2791981730556338E-07, -1.4622692106232861E-07, -1.2222183756484325E-06, 2.0809248311023210E-06, -1.2222183757873931E-06, -1.4622692102353826E-07, 5.2791981729316091E-07, -2.1971713837932606E-07, 1.2796250155256654E-08, 6.9838095920570556E-09}; + constexpr FLT c9[] = {1.2845897306280754E-09, -5.2304801922932016E-09, -5.0548716988230519E-09, 6.7539942919701739E-08, -1.6027276236470508E-07, 1.5655092162587644E-07, -2.2699174111659530E-17, -1.5655092177811256E-07, 1.6027276234809751E-07, -6.7539942915549835E-08, 5.0548716980878199E-09, 5.2304801922361841E-09, -1.2845897306281492E-09}; + constexpr FLT c10[] = {1.3345700642130691E-10, -1.1551704392484999E-09, 3.4412362338210342E-09, -3.2850871134619081E-09, -6.1855159001258178E-09, 2.3119925528230141E-08, -3.2145944300982594E-08, 2.3119925902816086E-08, -6.1855159856018208E-09, -3.2850871404873946E-09, 3.4412362337817518E-09, -1.1551704392006267E-09, 1.3345700642135499E-10}; + constexpr FLT c11[] = {-1.9694481417431522E-12, -7.0630732014279091E-11, 4.4161967795289093E-10, -1.2581280884757657E-09, 2.0087583501084213E-09, -1.6557203641746698E-09, -1.8616541410779625E-17, 1.6557200876291340E-09, -2.0087583293639620E-09, 1.2581281110480127E-09, -4.4161967829005099E-10, 7.0630731978786787E-11, 1.9694481417649691E-12}; + constexpr FLT c12[] = {-3.1122514900790565E-12, 1.0235548901329534E-11, -1.0076717372694741E-11, -3.6278868512857559E-11, 1.6235814346711627E-10, -3.2356761019059813E-10, 4.0014579978425756E-10, -3.2356781027851721E-10, 1.6235814857129835E-10, -3.6278894672280772E-11, -1.0076717867175658E-11, 1.0235548921264714E-11, -3.1122514901015834E-12}; + constexpr FLT c13[] = {-4.4521627553552838E-13, 2.1830423143884514E-12, -6.6494703003945180E-12, 1.3364546946965736E-11, -1.7572531722265358E-11, 1.3087516537835037E-11, -7.1887842145401140E-18, -1.3087607686672063E-11, 1.7572520191874396E-11, -1.3364550808652903E-11, 6.6494701329325954E-12, -2.1830423491074722E-12, 4.4521627550943629E-13}; + constexpr FLT c14[] = {-1.1331825591755105E-14, 7.5442529734273508E-14, -3.5473139032624521E-13, 1.0827877935901268E-12, -2.3054249530059664E-12, 3.5752188140828952E-12, -4.1288740135932172E-12, 3.5754451352466983E-12, -2.3054565626769554E-12, 1.0827765983400355E-12, -3.5473149190201202E-13, 7.5442562556957494E-14, -1.1331825576920660E-14}; for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); - } else if (w==14) { - constexpr FLT c0[] = {1.6070755785071491E-09, 4.6371263117318300E-06, 5.3392892770691468E-04, 1.3380163586766329E-02, 1.1960061568997656E-01, 4.7332499268789285E-01, 9.2104360429933863E-01, 9.2104360429933885E-01, 4.7332499268789302E-01, 1.1960061568997683E-01, 1.3380163586766332E-02, 5.3392892770691837E-04, 4.6371263117318342E-06, 1.6070755785075502E-09}; - constexpr FLT c1[] = {9.2475302076758674E-09, 1.3546865389183953E-05, 1.0306349751547578E-03, 1.7767594411827761E-02, 1.0518000824290019E-01, 2.3882936521395404E-01, 1.5170179567585843E-01, -1.5170179567585837E-01, -2.3882936521395398E-01, -1.0518000824290036E-01, -1.7767594411827754E-02, -1.0306349751547613E-03, -1.3546865389183977E-05, -9.2475302076757731E-09}; - constexpr FLT c2[] = {2.4024402573674993E-08, 1.8178651135370012E-05, 8.9712289901830596E-04, 1.0104692380253478E-02, 3.4193348251104483E-02, 1.8533380680638794E-02, -6.3746746886473832E-02, -6.3746746886473860E-02, 1.8533380680638745E-02, 3.4193348251104413E-02, 1.0104692380253471E-02, 8.9712289901830889E-04, 1.8178651135370046E-05, 2.4024402573675768E-08}; - constexpr FLT c3[] = {3.7419288907183495E-08, 1.4804264337309617E-05, 4.5929141335173144E-04, 3.0552592910038168E-03, 3.3079403387824323E-03, -1.0247716289024879E-02, -1.1480323948535117E-02, 1.1480323948535463E-02, 1.0247716289025027E-02, -3.3079403387824271E-03, -3.0552592910038120E-03, -4.5929141335173334E-04, -1.4804264337309643E-05, -3.7419288907183766E-08}; - constexpr FLT c4[] = {3.9124194363163287E-08, 8.1265227753122953E-06, 1.4975407030324905E-04, 4.4789439277602894E-04, -7.9407521150521383E-04, -1.9254008995687184E-03, 2.1136619999320748E-03, 2.1136619999320141E-03, -1.9254008995687132E-03, -7.9407521150514292E-04, 4.4789439277602867E-04, 1.4975407030325005E-04, 8.1265227753123105E-06, 3.9124194363164148E-08}; - constexpr FLT c5[] = {2.9113992252245385E-08, 3.1458937074171823E-06, 3.0585266291431613E-05, -6.5135387342551234E-06, -2.3196510408355524E-04, 1.5778347828067563E-04, 4.2181913759748168E-04, -4.2181913759742725E-04, -1.5778347828060562E-04, 2.3196510408355524E-04, 6.5135387342551234E-06, -3.0585266291432040E-05, -3.1458937074171887E-06, -2.9113992252245408E-08}; - constexpr FLT c6[] = {1.5927753226313472E-08, 8.6591441391883797E-07, 3.1186030532599549E-06, -1.4256326863802477E-05, -6.9192418278078229E-06, 6.1786486497582421E-05, -4.4611361914704291E-05, -4.4611361914610670E-05, 6.1786486497541994E-05, -6.9192418278024798E-06, -1.4256326863804276E-05, 3.1186030532598494E-06, 8.6591441391883161E-07, 1.5927753226313945E-08}; - constexpr FLT c7[] = {6.5072355972925020E-09, 1.6321871905299654E-07, -1.6208737249918160E-07, -2.0005919851675986E-06, 4.6289117401651821E-06, 1.5738407907104777E-07, -1.0033756087313552E-05, 1.0033756087535249E-05, -1.5738407898383816E-07, -4.6289117402341052E-06, 2.0005919851709152E-06, 1.6208737249923451E-07, -1.6321871905299225E-07, -6.5072355972922787E-09}; - constexpr FLT c8[] = {1.9857214221989366E-09, 1.7788899565181922E-08, -1.0133541198312604E-07, 4.4566342395340293E-08, 5.3564828266574526E-07, -1.1695093255338883E-06, 6.7085595118984104E-07, 6.7085595114069746E-07, -1.1695093255217181E-06, 5.3564828276835377E-07, 4.4566342396873204E-08, -1.0133541198326502E-07, 1.7788899565180526E-08, 1.9857214221992563E-09}; - constexpr FLT c9[] = {4.4289508956510332E-10, -2.3397558741938982E-11, -1.2203541602658680E-08, 4.1555456455006879E-08, -4.0387396856849884E-08, -5.2822132653130956E-08, 1.7383889351097292E-07, -1.7383889353173241E-07, 5.2822132672506464E-08, 4.0387396834706444E-08, -4.1555456455698865E-08, 1.2203541602950610E-08, 2.3397558742361335E-11, -4.4289508956485253E-10}; - constexpr FLT c10[] = {6.7195187479843226E-11, -3.6781600571171619E-10, 1.8909214083296717E-10, 3.2074788122994124E-09, -1.0777792237807384E-08, 1.5287295377979802E-08, -7.6060392723093131E-09, -7.6060391755201933E-09, 1.5287295398091755E-08, -1.0777792217695420E-08, 3.2074788146563205E-09, 1.8909214044014493E-10, -3.6781600571662634E-10, 6.7195187480068943E-11}; - constexpr FLT c11[] = {5.1753158905822061E-12, -5.7459004384753609E-11, 2.1373772914288248E-10, -3.3474981614755248E-10, -5.5056523013581392E-11, 1.1984997345151211E-09, -2.3401534609898206E-09, 2.3401534737665714E-09, -1.1984997515507915E-09, 5.5056487167718091E-11, 3.3474981678638774E-10, -2.1373772871699109E-10, 5.7459004393903842E-11, -5.1753158903480283E-12}; - constexpr FLT c12[] = {-3.4295334316135217E-13, -1.9669734020395281E-12, 1.8829710516667924E-11, -6.6063898621267923E-11, 1.2987243021035191E-10, -1.4723142988261286E-10, 6.6816662742079877E-11, 6.6816650491789053E-11, -1.4723143192432656E-10, 1.2987247614892944E-10, -6.6063898621269021E-11, 1.8829709886607818E-11, -1.9669734162457477E-12, -3.4295334295692199E-13}; - constexpr FLT c13[] = {-1.4925032356367256E-13, 5.9101412900182951E-13, -1.0473414103260276E-12, -3.4168877521962931E-13, 6.3681343308181771E-12, -1.6773485918159645E-11, 2.5499676364679485E-11, -2.5499722384571941E-11, 1.6773473223016897E-11, -6.3681501997466111E-12, 3.4168877521962931E-13, 1.0473414909104298E-12, -5.9101412551500433E-13, 1.4925032367414924E-13}; - constexpr FLT c14[] = {-1.6512890188764807E-14, 8.8250735109913167E-14, -3.0062084749515021E-13, 6.8819378623923325E-13, -1.0710378278007934E-12, 1.0658930503703208E-12, -4.5535006559156473E-13, -4.5529417109990688E-13, 1.0659116818675222E-12, -1.0710247857527394E-12, 6.8819549412647750E-13, -3.0062091542248455E-13, 8.8250729803090660E-14, -1.6512890092223385E-14}; - constexpr FLT c15[] = {1.6573977440105294E-16, 1.3350735743743382E-15, -1.0198606577404851E-14, 3.9099634678793536E-14, -9.7801981044810947E-14, 1.7461338478760738E-13, -2.3137912816883565E-13, 2.3133990246879147E-13, -1.7463221312362809E-13, 9.7795403196649327E-14, -3.9099513984331611E-14, 1.0198764988885690E-14, -1.3350660309704511E-15, -1.6573967886539614E-16}; + } else if constexpr (w==14) { + constexpr FLT c0[] = {1.6070755785071679E-09, 4.6371263117318300E-06, 5.3392892770691446E-04, 1.3380163586766324E-02, 1.1960061568997654E-01, 4.7332499268789291E-01, 9.2104360429933807E-01, 9.2104360429933807E-01, 4.7332499268789280E-01, 1.1960061568997681E-01, 1.3380163586766324E-02, 5.3392892770691772E-04, 4.6371263117318300E-06, 1.6070755785075471E-09}; + constexpr FLT c1[] = {9.2475302076758724E-09, 1.3546865389183950E-05, 1.0306349751547574E-03, 1.7767594411827743E-02, 1.0518000824290007E-01, 2.3882936521395379E-01, 1.5170179567585820E-01, -1.5170179567585820E-01, -2.3882936521395384E-01, -1.0518000824290029E-01, -1.7767594411827743E-02, -1.0306349751547609E-03, -1.3546865389183968E-05, -9.2475302076757831E-09}; + constexpr FLT c2[] = {2.4024402573674970E-08, 1.8178651135370006E-05, 8.9712289901830574E-04, 1.0104692380253475E-02, 3.4193348251104448E-02, 1.8533380680638787E-02, -6.3746746886473971E-02, -6.3746746886473984E-02, 1.8533380680638711E-02, 3.4193348251104379E-02, 1.0104692380253475E-02, 8.9712289901830921E-04, 1.8178651135370046E-05, 2.4024402573675774E-08}; + constexpr FLT c3[] = {3.7419288907183508E-08, 1.4804264337309617E-05, 4.5929141335173139E-04, 3.0552592910038151E-03, 3.3079403387824180E-03, -1.0247716289024938E-02, -1.1480323948535189E-02, 1.1480323948535308E-02, 1.0247716289024964E-02, -3.3079403387824401E-03, -3.0552592910038159E-03, -4.5929141335173350E-04, -1.4804264337309641E-05, -3.7419288907183740E-08}; + constexpr FLT c4[] = {3.9124194363163294E-08, 8.1265227753122970E-06, 1.4975407030324897E-04, 4.4789439277602596E-04, -7.9407521150522923E-04, -1.9254008995687538E-03, 2.1136619999319122E-03, 2.1136619999319001E-03, -1.9254008995687199E-03, -7.9407521150516136E-04, 4.4789439277602650E-04, 1.4975407030324992E-04, 8.1265227753123038E-06, 3.9124194363164128E-08}; + constexpr FLT c5[] = {2.9113992252245372E-08, 3.1458937074171777E-06, 3.0585266291431484E-05, -6.5135387342584785E-06, -2.3196510408360701E-04, 1.5778347828054417E-04, 4.2181913759730018E-04, -4.2181913759742806E-04, -1.5778347828063221E-04, 2.3196510408354706E-04, 6.5135387342537318E-06, -3.0585266291431918E-05, -3.1458937074171849E-06, -2.9113992252245408E-08}; + constexpr FLT c6[] = {1.5927753226313462E-08, 8.6591441391883373E-07, 3.1186030532596436E-06, -1.4256326863802855E-05, -6.9192418278230890E-06, 6.1786486497546967E-05, -4.4611361914771356E-05, -4.4611361914751718E-05, 6.1786486497529485E-05, -6.9192418278288107E-06, -1.4256326863804007E-05, 3.1186030532597304E-06, 8.6591441391883235E-07, 1.5927753226313955E-08}; + constexpr FLT c7[] = {6.5072355972925020E-09, 1.6321871905299905E-07, -1.6208737249913681E-07, -2.0005919851675999E-06, 4.6289117401734085E-06, 1.5738407906137680E-07, -1.0033756087411173E-05, 1.0033756087382978E-05, -1.5738407906832162E-07, -4.6289117402499938E-06, 2.0005919851674550E-06, 1.6208737249922011E-07, -1.6321871905298894E-07, -6.5072355972922629E-09}; + constexpr FLT c8[] = {1.9857214221989477E-09, 1.7788899565186164E-08, -1.0133541198303713E-07, 4.4566342397467425E-08, 5.3564828267041993E-07, -1.1695093255347421E-06, 6.7085595108654188E-07, 6.7085595111459169E-07, -1.1695093255478542E-06, 5.3564828275006633E-07, 4.4566342393758532E-08, -1.0133541198341660E-07, 1.7788899565174329E-08, 1.9857214221992526E-09}; + constexpr FLT c9[] = {4.4289508956509546E-10, -2.3397558745655689E-11, -1.2203541602950612E-08, 4.1555456454833893E-08, -4.0387396870689535E-08, -5.2822132694649911E-08, 1.7383889346668610E-07, -1.7383889350128520E-07, 5.2822132655898894E-08, 4.0387396825018697E-08, -4.1555456456217858E-08, 1.2203541602961425E-08, 2.3397558742445808E-11, -4.4289508956485521E-10}; + constexpr FLT c10[] = {6.7195187479837397E-11, -3.6781600571601267E-10, 1.8909214069547874E-10, 3.2074788110424279E-09, -1.0777792237178902E-08, 1.5287295344040793E-08, -7.6060393515002415E-09, -7.6060394193782061E-09, 1.5287295315129795E-08, -1.0777792256033903E-08, 3.2074788100996828E-09, 1.8909214022410232E-10, -3.6781600571938840E-10, 6.7195187480072407E-11}; + constexpr FLT c11[] = {5.1753158905849938E-12, -5.7459004383367526E-11, 2.1373772926710777E-10, -3.3474981579266315E-10, -5.5056499021679011E-11, 1.1984997640436430E-09, -2.3401534450899741E-09, 2.3401534368560627E-09, -1.1984997614882759E-09, 5.5056481773059742E-11, 3.3474981494088389E-10, -2.1373772865488849E-10, 5.7459004398339857E-11, -5.1753158903399997E-12}; + constexpr FLT c12[] = {-3.4295334314887252E-13, -1.9669733997962757E-12, 1.8829710684156337E-11, -6.6063897345211508E-11, 1.2987243531465558E-10, -1.4723141559059891E-10, 6.6816638241450007E-11, 6.6816674992423938E-11, -1.4723144009120710E-10, 1.2987246389863218E-10, -6.6063899833524000E-11, 1.8829709790898323E-11, -1.9669734189872262E-12, -3.4295334295386873E-13}; + constexpr FLT c13[] = {-1.4925032356486504E-13, 5.9101412775720763E-13, -1.0473415580082965E-12, -3.4168975321644197E-13, 6.3681265546106545E-12, -1.6773501596110874E-11, 2.5499682861090652E-11, -2.5499676599540805E-11, 1.6773491089725712E-11, -6.3681491385782352E-12, 3.4168991639689697E-13, 1.0473415086930869E-12, -5.9101412527197903E-13, 1.4925032367463167E-13}; + constexpr FLT c14[] = {-1.6512890190412886E-14, 8.8250732077299682E-14, -3.0062088735298833E-13, 6.8819261910563679E-13, -1.0710344223118919E-12, 1.0658816851325112E-12, -4.5538510635793641E-13, -4.5542586230308239E-13, 1.0658698063253250E-12, -1.0710440527620932E-12, 6.8819219968244471E-13, -3.0062102913299402E-13, 8.8250727745177781E-14, -1.6512890091669500E-14}; + constexpr FLT c15[] = {1.6573977544796275E-16, 1.3350732982407589E-15, -1.0198568872965102E-14, 3.9099748138401563E-14, -9.7788768122723911E-14, 1.7463267454485294E-13, -2.3133010081886947E-13, 2.3135108981080239E-13, -1.7462349391883023E-13, 9.7794991333538680E-14, -3.9100183236637960E-14, 1.0198818425583603E-14, -1.3350638233291884E-15, -1.6573967606346456E-16}; for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); - } else if (w==15) { - constexpr FLT c0[] = {2.4886236238313534E-10, 1.0156314710024854E-06, 1.5297772142853732E-04, 4.9110296377727252E-03, 5.6121982134094042E-02, 2.8670951404936740E-01, 7.3488453954210731E-01, 1.0000000000000018E+00, 7.3488453954210708E-01, 2.8670951404936784E-01, 5.6121982134094188E-02, 4.9110296377727321E-03, 1.5297772142853737E-04, 1.0156314710024854E-06, 2.4886236238313394E-10}; - constexpr FLT c1[] = {1.4880454274285384E-09, 3.1146031777409673E-06, 3.1470309742465694E-04, 7.1215977556942766E-03, 5.6335374470954679E-02, 1.8245542837228418E-01, 2.2739494478010200E-01, -4.2425842671825266E-17, -2.2739494478010208E-01, -1.8245542837228432E-01, -5.6335374470954783E-02, -7.1215977556942861E-03, -3.1470309742465694E-04, -3.1146031777409668E-06, -1.4880454274285366E-09}; - constexpr FLT c2[] = {4.0364738474324423E-09, 4.4152383936309442E-06, 2.9537757977456596E-04, 4.5415629108243238E-03, 2.2685962261788550E-02, 3.3502333548319392E-02, -2.2696322242195994E-02, -7.6666666666667133E-02, -2.2696322242195945E-02, 3.3502333548319260E-02, 2.2685962261788570E-02, 4.5415629108243273E-03, 2.9537757977456591E-04, 4.4152383936309416E-06, 4.0364738474324407E-09}; - constexpr FLT c3[] = {6.6006259688120961E-09, 3.8297656275654657E-06, 1.6597029248061439E-04, 1.6248331197066942E-03, 4.0281119347581979E-03, -2.8399908290139206E-03, -1.3756562885831705E-02, 1.0758125681708418E-16, 1.3756562885831904E-02, 2.8399908290139895E-03, -4.0281119347581771E-03, -1.6248331197066914E-03, -1.6597029248061437E-04, -3.8297656275654657E-06, -6.6006259688120969E-09}; - constexpr FLT c4[] = {7.2920076887968825E-09, 2.2644150332986910E-06, 6.1226481435400985E-05, 3.3216368068303816E-04, 4.2258807580024870E-07, -1.7026747228854500E-03, -1.2026158633582243E-04, 2.8537037037044089E-03, -1.2026158633584264E-04, -1.7026747228853732E-03, 4.2258807580182180E-07, 3.3216368068303642E-04, 6.1226481435401053E-05, 2.2644150332986919E-06, 7.2920076887968842E-09}; - constexpr FLT c5[] = {5.7777535593445574E-09, 9.5996306286140537E-07, 1.5097159537535560E-05, 2.8094504791464212E-05, -1.2791075475386364E-04, -1.0516749004210079E-04, 4.0040320377530828E-04, 5.4844446833709888E-17, -4.0040320377525385E-04, 1.0516749004229523E-04, 1.2791075475386559E-04, -2.8094504791467126E-05, -1.5097159537535560E-05, -9.5996306286140579E-07, -5.7777535593445582E-09}; - constexpr FLT c6[] = {3.3986627004323950E-09, 2.9741452947022275E-07, 2.3232144780590118E-06, -3.5941523174497321E-06, -1.8171775676701533E-05, 3.2858338560981214E-05, 2.0665249075258455E-05, -6.8763374485615104E-05, 2.0665249075221676E-05, 3.2858338560934424E-05, -1.8171775676683576E-05, -3.5941523174470280E-06, 2.3232144780590435E-06, 2.9741452947022206E-07, 3.3986627004323950E-09}; - constexpr FLT c7[] = {1.5128957992049987E-09, 6.6672685257784247E-08, 1.4160936684823307E-07, -1.2611166225385906E-06, 6.6865545481897967E-07, 4.6861078169740899E-06, -7.4523870622442393E-06, 5.1688954219266444E-17, 7.4523870623463821E-06, -4.6861078171739939E-06, -6.6865545481690963E-07, 1.2611166225370325E-06, -1.4160936684824530E-07, -6.6672685257784551E-08, -1.5128957992049987E-09}; - constexpr FLT c8[] = {5.1310324414219292E-10, 1.0163871982745590E-08, -2.4441175134592830E-08, -1.0543632600171378E-07, 4.0979777876715675E-07, -2.9573937051194202E-07, -5.9824625884543558E-07, 1.2067769776847866E-06, -5.9824625879665336E-07, -2.9573937049659643E-07, 4.0979777875267863E-07, -1.0543632599876183E-07, -2.4441175134530762E-08, 1.0163871982746284E-08, 5.1310324414219364E-10}; - constexpr FLT c9[] = {1.3160883866734095E-10, 8.0584478671564817E-10, -6.7824252838686685E-09, 9.4471403089230076E-09, 2.4030590211824177E-08, -9.0522548480936782E-08, 9.9320303339648267E-08, 1.4827374781995408E-17, -9.9320303311968964E-08, 9.0522548602725694E-08, -2.4030590184836860E-08, -9.4471403124694187E-09, 6.7824252839146209E-09, -8.0584478671585931E-10, -1.3160883866734196E-10}; - constexpr FLT c10[] = {2.4734066313995269E-11, -4.3978001545632529E-11, -5.4975091406435660E-10, 2.6307942070348926E-09, -4.2001676281559915E-09, -1.8212709350780177E-10, 1.0547608795803518E-08, -1.6454374555673015E-08, 1.0547608746152108E-08, -1.8212708345187657E-10, -4.2001676312984721E-09, 2.6307942087632753E-09, -5.4975091402508072E-10, -4.3978001545363347E-11, 2.4734066313995970E-11}; - constexpr FLT c11[] = {3.0917581107111067E-12, -2.1504981481527399E-11, 3.4611945838654282E-11, 1.1082666500276105E-10, -5.8883840899000033E-10, 1.1304779661881485E-09, -1.0037911406820197E-09, -5.7884986037117854E-17, 1.0037911398302301E-09, -1.1304781086488634E-09, 5.8883842723235649E-10, -1.1082666592552764E-10, -3.4611945887454015E-11, 2.1504981480972878E-11, -3.0917581107111891E-12}; - constexpr FLT c12[] = {1.5997634038655269E-13, -2.4807970173617968E-12, 1.1275106610326804E-11, -2.3847055813595321E-11, 1.5364454138408298E-11, 4.4350534757580891E-11, -1.3563510404683277E-10, 1.8159081432580251E-10, -1.3563508771311925E-10, 4.4350484735577755E-11, 1.5364420705333068E-11, -2.3847054665131313E-11, 1.1275106670142851E-11, -2.4807970168633410E-12, 1.5997634038739785E-13}; - constexpr FLT c13[] = {-2.4800914618527656E-14, -2.0428592368367617E-14, 6.6720756177865110E-13, -2.9781122281459938E-12, 7.0947566948544657E-12, -1.0181675867287212E-11, 7.9189142537208719E-12, -1.4497056804736912E-17, -7.9189459915777383E-12, 1.0181666345930152E-11, -7.0947487603902491E-12, 2.9781098973971301E-12, -6.6720754938105074E-13, 2.0428592180708626E-14, 2.4800914617770965E-14}; - constexpr FLT c14[] = {-6.3774103672726629E-15, 2.8974955370030088E-14, -6.8422346755457550E-14, 5.3399811794037740E-14, 1.7893441503609519E-13, -7.2418549150581294E-13, 1.3713697997539906E-12, -1.6687145216540105E-12, 1.3713520998316439E-12, -7.2416872315832831E-13, 1.7893006768675052E-13, 5.3400626922038687E-14, -6.8422339477528482E-14, 2.8974955559559462E-14, -6.3774103666804019E-15}; - constexpr FLT c15[] = {-5.1635500202709335E-16, 3.1828105471276549E-15, -1.2111383721117860E-14, 3.1272734620510859E-14, -5.6176935449952714E-14, 6.8640388687474512E-14, -4.9039125333789703E-14, -3.5058680377244798E-17, 4.9029469776856299E-14, -6.8666790600965935E-14, 5.6189548021197700E-14, -3.1272749707318549E-14, 1.2111366748459164E-14, -3.1828106649933298E-15, 5.1635500199831522E-16}; - constexpr FLT c16[] = {4.5179133600663468E-18, -1.3721818586136237E-17, -2.0190809683029299E-16, 1.1787611877454253E-15, -3.5963787346199218E-15, 7.4622525856292898E-15, -1.1451676136812928E-14, 1.2941737777564503E-14, -1.1457648327763603E-14, 7.4174611535501039E-15, -3.6182145577673462E-15, 1.1783995902489914E-15, -2.0188185185104562E-16, -1.3721704675617759E-17, 4.5179136270619547E-18}; + } else if constexpr (w==15) { + constexpr FLT c0[] = {2.4886236238313694E-10, 1.0156314710024858E-06, 1.5297772142853732E-04, 4.9110296377727243E-03, 5.6121982134094014E-02, 2.8670951404936723E-01, 7.3488453954210675E-01, 1.0000000000000009E+00, 7.3488453954210664E-01, 2.8670951404936756E-01, 5.6121982134094153E-02, 4.9110296377727286E-03, 1.5297772142853721E-04, 1.0156314710024835E-06, 2.4886236238313125E-10}; + constexpr FLT c1[] = {1.4880454274285384E-09, 3.1146031777409656E-06, 3.1470309742465672E-04, 7.1215977556942697E-03, 5.6335374470954637E-02, 1.8245542837228401E-01, 2.2739494478010158E-01, -2.1212921335912513E-17, -2.2739494478010189E-01, -1.8245542837228415E-01, -5.6335374470954748E-02, -7.1215977556942818E-03, -3.1470309742465672E-04, -3.1146031777409634E-06, -1.4880454274285361E-09}; + constexpr FLT c2[] = {4.0364738474324382E-09, 4.4152383936309391E-06, 2.9537757977456607E-04, 4.5415629108243203E-03, 2.2685962261788539E-02, 3.3502333548319357E-02, -2.2696322242195973E-02, -7.6666666666667369E-02, -2.2696322242196015E-02, 3.3502333548319246E-02, 2.2685962261788574E-02, 4.5415629108243290E-03, 2.9537757977456613E-04, 4.4152383936309408E-06, 4.0364738474324415E-09}; + constexpr FLT c3[] = {6.6006259688120969E-09, 3.8297656275654666E-06, 1.6597029248061442E-04, 1.6248331197066931E-03, 4.0281119347582005E-03, -2.8399908290139731E-03, -1.3756562885831778E-02, 2.8214012620215877E-18, 1.3756562885831887E-02, 2.8399908290139960E-03, -4.0281119347581893E-03, -1.6248331197066925E-03, -1.6597029248061439E-04, -3.8297656275654657E-06, -6.6006259688120961E-09}; + constexpr FLT c4[] = {7.2920076887968800E-09, 2.2644150332986919E-06, 6.1226481435400998E-05, 3.3216368068303718E-04, 4.2258807580337500E-07, -1.7026747228854671E-03, -1.2026158633594345E-04, 2.8537037037042021E-03, -1.2026158633592798E-04, -1.7026747228854671E-03, 4.2258807579244642E-07, 3.3216368068303610E-04, 6.1226481435401012E-05, 2.2644150332986910E-06, 7.2920076887968800E-09}; + constexpr FLT c5[] = {5.7777535593445565E-09, 9.5996306286140452E-07, 1.5097159537535414E-05, 2.8094504791460675E-05, -1.2791075475387991E-04, -1.0516749004214654E-04, 4.0040320377514744E-04, -2.9254521903171209E-17, -4.0040320377523759E-04, 1.0516749004227534E-04, 1.2791075475386646E-04, -2.8094504791466567E-05, -1.5097159537535478E-05, -9.5996306286140431E-07, -5.7777535593445590E-09}; + constexpr FLT c6[] = {3.3986627004323946E-09, 2.9741452947022095E-07, 2.3232144780589999E-06, -3.5941523174528157E-06, -1.8171775676707950E-05, 3.2858338560946018E-05, 2.0665249075229327E-05, -6.8763374485729812E-05, 2.0665249075209737E-05, 3.2858338560891984E-05, -1.8171775676692009E-05, -3.5941523174462754E-06, 2.3232144780590639E-06, 2.9741452947022190E-07, 3.3986627004323966E-09}; + constexpr FLT c7[] = {1.5128957992049979E-09, 6.6672685257785398E-08, 1.4160936684828114E-07, -1.2611166225393906E-06, 6.6865545482379982E-07, 4.6861078169658144E-06, -7.4523870622697214E-06, -1.4396126433923030E-18, 7.4523870622821872E-06, -4.6861078171786771E-06, -6.6865545482258369E-07, 1.2611166225357075E-06, -1.4160936684821783E-07, -6.6672685257784604E-08, -1.5128957992049966E-09}; + constexpr FLT c8[] = {5.1310324414219137E-10, 1.0163871982746801E-08, -2.4441175134521862E-08, -1.0543632600130600E-07, 4.0979777877938960E-07, -2.9573937051795272E-07, -5.9824625884976530E-07, 1.2067769776046611E-06, -5.9824625884104414E-07, -2.9573937054198998E-07, 4.0979777874105681E-07, -1.0543632599939765E-07, -2.4441175134571291E-08, 1.0163871982744987E-08, 5.1310324414218920E-10}; + constexpr FLT c9[] = {1.3160883866734097E-10, 8.0584478671573275E-10, -6.7824252839632764E-09, 9.4471403078850365E-09, 2.4030590209056248E-08, -9.0522548467097150E-08, 9.9320303311968977E-08, 6.4801334061299793E-18, -9.9320303306433127E-08, 9.0522548602725707E-08, -2.4030590186912812E-08, -9.4471403125559170E-09, 6.7824252839227305E-09, -8.0584478671577494E-10, -1.3160883866734361E-10}; + constexpr FLT c10[] = {2.4734066313994419E-11, -4.3978001545939149E-11, -5.4975091409873009E-10, 2.6307942047565849E-09, -4.2001676325554839E-09, -1.8212710984882958E-10, 1.0547608679845328E-08, -1.6454374715311988E-08, 1.0547608656276595E-08, -1.8212716201433033E-10, -4.2001676495249784E-09, 2.6307942074277104E-09, -5.4975091408891053E-10, -4.3978001545785610E-11, 2.4734066313995337E-11}; + constexpr FLT c11[] = {3.0917581107104265E-12, -2.1504981480418787E-11, 3.4611945905203146E-11, 1.1082666507373995E-10, -5.8883840835118001E-10, 1.1304780041635263E-09, -1.0037911054750330E-09, 1.1228415117366755E-17, 1.0037911049071899E-09, -1.1304781011248302E-09, 5.8883843035557056E-10, -1.1082666656435723E-10, -3.4611945847530977E-11, 2.1504981481389192E-11, -3.0917581107093485E-12}; + constexpr FLT c12[] = {1.5997634038637621E-13, -2.4807970161781935E-12, 1.1275106698059214E-11, -2.3847055079858343E-11, 1.5364460008330925E-11, 4.4350562320757153E-11, -1.3563506321257987E-10, 1.8159081840923697E-10, -1.3563507137937324E-10, 4.4350478610403863E-11, 1.5364414580198884E-11, -2.3847054824637823E-11, 1.1275106650202108E-11, -2.4807970177976851E-12, 1.5997634038475929E-13}; + constexpr FLT c13[] = {-2.4800914618144935E-14, -2.0428592100498625E-14, 6.6720752625942027E-13, -2.9781126552009906E-12, 7.0947564462517667E-12, -1.0181661180942040E-11, 7.9189163961032134E-12, 3.7608949396856996E-18, -7.9189226029666706E-12, 1.0181683671234862E-11, -7.0947475521936810E-12, 2.9781099081789285E-12, -6.6720753193899847E-13, 2.0428592265940106E-14, 2.4800914617314154E-14}; + constexpr FLT c14[] = {-6.3774103676221929E-15, 2.8974955310173576E-14, -6.8422373638576966E-14, 5.3398642699701678E-14, 1.7893165730225197E-13, -7.2418999429507432E-13, 1.3713100656893202E-12, -1.6687865627694359E-12, 1.3712960091978784E-12, -7.2420254070755149E-13, 1.7892100253036837E-13, 5.3399727540394014E-14, -6.8422372503399220E-14, 2.8974955138269160E-14, -6.3774103674607597E-15}; + constexpr FLT c15[] = {-5.1635500234731389E-16, 3.1828109237030503E-15, -1.2111357125137462E-14, 3.1272789056801543E-14, -5.6177765027145775E-14, 6.8664184316750068E-14, -4.9010181633415173E-14, 1.5014404617227298E-17, 4.9025280117573159E-14, -6.8658963234056455E-14, 5.6191826354025916E-14, -3.1272848944295054E-14, 1.2111379512738901E-14, -3.1828103838144419E-15, 5.1635500252288918E-16}; + constexpr FLT c16[] = {4.5179133906197547E-18, -1.3721326906232784E-17, -2.0186815184489892E-16, 1.1791054601203943E-15, -3.5949764000106320E-15, 7.4864168939925667E-15, -1.1419241866991483E-14, 1.2981541205285921E-14, -1.1427309516992720E-14, 7.4238789517193859E-15, -3.6185770943210024E-15, 1.1784710847541274E-15, -2.0188535534452587E-16, -1.3721947132261775E-17, 4.5179130340389477E-18}; for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); - } else if (w==16) { - constexpr FLT c0[] = {3.7973138383475505E-11, 2.1620729770457867E-07, 4.2059935922517660E-05, 1.7055631615451750E-03, 2.4507833223051390E-02, 1.5833750021928361E-01, 5.2065761855025572E-01, 9.3058177132107800E-01, 9.3058177132107822E-01, 5.2065761855025583E-01, 1.5833750021928361E-01, 2.4507833223051407E-02, 1.7055631615451757E-03, 4.2059935922517680E-05, 2.1620729770457854E-07, 3.7973138383475363E-11}; - constexpr FLT c1[] = {2.3529614069937368E-10, 6.9307767643753084E-07, 9.1584555859393273E-05, 2.6688190455647263E-03, 2.7424935799146805E-02, 1.1980519064171602E-01, 2.2858769149343988E-01, 1.3403316930972969E-01, -1.3403316930972969E-01, -2.2858769149343988E-01, -1.1980519064171603E-01, -2.7424935799146809E-02, -2.6688190455647263E-03, -9.1584555859393273E-05, -6.9307767643753063E-07, -2.3529614069937291E-10}; - constexpr FLT c2[] = {6.6422278409342484E-10, 1.0324321112746625E-06, 9.1817488865684769E-05, 1.8711533829047168E-03, 1.2921996060610234E-02, 3.2047854205940321E-02, 1.0693035516337747E-02, -5.7626889750985358E-02, -5.7626889750985420E-02, 1.0693035516337622E-02, 3.2047854205940300E-02, 1.2921996060610227E-02, 1.8711533829047159E-03, 9.1817488865684728E-05, 1.0324321112746625E-06, 6.6422278409342453E-10}; - constexpr FLT c3[] = {1.1357078950958115E-09, 9.4728532805183455E-07, 5.5827161828283907E-05, 7.6087086075588353E-04, 3.0946204357507638E-03, 1.6729582927767952E-03, -9.5127691406672668E-03, -8.9630953638633881E-03, 8.9630953638635737E-03, 9.5127691406674039E-03, -1.6729582927767412E-03, -3.0946204357507521E-03, -7.6087086075588267E-04, -5.5827161828283886E-05, -9.4728532805183402E-07, -1.1357078950958119E-09}; - constexpr FLT c4[] = {1.3190161602522571E-09, 5.9764321317063336E-07, 2.2744388605472980E-05, 1.9073517322668089E-04, 2.8943142766413201E-04, -8.8625893129445465E-04, -1.3389167739520302E-03, 1.7216657535080475E-03, 1.7216657535079566E-03, -1.3389167739519974E-03, -8.8625893129445302E-04, 2.8943142766413342E-04, 1.9073517322668089E-04, 2.2744388605472997E-05, 5.9764321317063368E-07, 1.3190161602522571E-09}; - constexpr FLT c5[] = {1.1057322032863292E-09, 2.7364351668058875E-07, 6.4277990516969732E-06, 2.7144256967440253E-05, -3.6927862875708149E-05, -1.6756539822663250E-04, 1.6190404775924360E-04, 2.9203183363577429E-04, -2.9203183363574707E-04, -1.6190404775915027E-04, 1.6756539822663250E-04, 3.6927862875712038E-05, -2.7144256967440009E-05, -6.4277990516969918E-06, -2.7364351668058875E-07, -1.1057322032863296E-09}; - constexpr FLT c6[] = {6.9354916180818945E-10, 9.3269475195063855E-08, 1.2384428187212403E-06, 8.4996778392803041E-07, -1.3106613626284104E-05, 2.8218026704026646E-06, 4.1119875273776001E-05, -3.3017437945353985E-05, -3.3017437945415066E-05, 4.1119875273714446E-05, 2.8218026703990287E-06, -1.3106613626289508E-05, 8.4996778392747454E-07, 1.2384428187212240E-06, 9.3269475195063643E-08, 6.9354916180818914E-10}; - constexpr FLT c7[] = {3.3254260763956042E-10, 2.3748169129617104E-08, 1.4324995919586480E-07, -4.5855119979446571E-07, -9.5896649524100645E-07, 3.6155491755001142E-06, -9.8206137491315186E-07, -6.1812989819835450E-06, 6.1812989820611756E-06, 9.8206137497544330E-07, -3.6155491754721922E-06, 9.5896649524660746E-07, 4.5855119979503682E-07, -1.4324995919584492E-07, -2.3748169129616922E-08, -3.3254260763956068E-10}; - constexpr FLT c8[] = {1.2320735888479529E-10, 4.4066719437554910E-09, 2.9936173156462927E-09, -8.7082338359679101E-08, 1.2972939456291547E-07, 2.2882425903046301E-07, -7.3491924909334631E-07, 4.5592445674903059E-07, 4.5592445658978770E-07, -7.3491924903833956E-07, 2.2882425902441689E-07, 1.2972939456293178E-07, -8.7082338359266715E-08, 2.9936173156449473E-09, 4.4066719437557416E-09, 1.2320735888479524E-10}; - constexpr FLT c9[] = {3.5284250010876628E-11, 5.4380355945640250E-10, -2.1550460241694361E-09, -3.7344953348928088E-09, 2.7722604311846508E-08, -3.9597167021230792E-08, -1.3993916628542531E-08, 9.5626629210101709E-08, -9.5626629290371673E-08, 1.3993916670061478E-08, 3.9597167019846826E-08, -2.7722604310808535E-08, 3.7344953348928088E-09, 2.1550460241924123E-09, -5.4380355945618072E-10, -3.5284250010876789E-11}; - constexpr FLT c10[] = {7.7013760205813290E-12, 2.8123297626332877E-11, -3.7953802132437611E-10, 8.7573780453214681E-10, 5.1359846908750478E-10, -5.3609157480923598E-09, 9.1303305149265196E-09, -4.8150450778386211E-09, -4.8150450602405480E-09, 9.1303305006281353E-09, -5.3609157342653948E-09, 5.1359846657352753E-10, 8.7573780480711250E-10, -3.7953802133297068E-10, 2.8123297626237416E-11, 7.7013760205811319E-12}; - constexpr FLT c11[] = {1.2276300481459368E-12, -4.1769601372671798E-12, -1.9148402800715177E-11, 1.3822953630779855E-10, -3.0994364017547768E-10, 2.0316700893505159E-10, 4.3650568116859601E-10, -1.1534087567294806E-09, 1.1534086455717957E-09, -4.3650568244627625E-10, -2.0316701046115955E-10, 3.0994364003351358E-10, -1.3822953650299937E-10, 1.9148402794060861E-11, 4.1769601372325045E-12, -1.2276300481460517E-12}; - constexpr FLT c12[] = {1.2527329159215257E-13, -1.0816725479918068E-12, 2.7445378707133412E-12, 1.7839886378835549E-12, -2.6194655703148228E-11, 6.7446666417949068E-11, -8.5082142817277568E-11, 4.0255080062661886E-11, 4.0254965726647763E-11, -8.5082126483561454E-11, 6.7446671522236455E-11, -2.6194657362041918E-11, 1.7839889409505645E-12, 2.7445378607441180E-12, -1.0816725479139360E-12, 1.2527329159224173E-13}; - constexpr FLT c13[] = {3.2506946752710786E-15, -9.2845381849289691E-14, 5.1542691616877330E-13, -1.3678932005895992E-12, 1.6503397946393055E-12, 7.2548932254614457E-13, -6.2314806405069215E-12, 1.1299375277421538E-11, -1.1299433992456742E-11, 6.2314647715784883E-12, -7.2550201768889120E-13, -1.6503403897241219E-12, 1.3678930766135958E-12, -5.1542690377117294E-13, 9.2845381940092428E-14, -3.2506946753893115E-15}; - constexpr FLT c14[] = {-1.3523251101878356E-15, 1.9055798839533079E-15, 1.8430813184053169E-14, -1.1526987096958319E-13, 3.3349122385594633E-13, -5.8352048227061829E-13, 6.1751861733538967E-13, -2.7104853725824153E-13, -2.7103052681092733E-13, 6.1751644366071028E-13, -5.8351023494715043E-13, 3.3348982649365648E-13, -1.1526961866805939E-13, 1.8430809545089241E-14, 1.9055798650003023E-15, -1.3523251102248507E-15}; - constexpr FLT c15[] = {-2.4132931360656334E-16, 1.2442654599774185E-15, -3.5592598733275504E-15, 5.0956447378324209E-15, 1.6446732556150498E-15, -2.5290498540837812E-14, 6.2712721591286338E-14, -9.2666673089509217E-14, 9.2581824882952367E-14, -6.2712118118977746E-14, 2.5288160085642670E-14, -1.6451258598462044E-15, -5.0958559531403920E-15, 3.5592532728491847E-15, -1.2442654894438389E-15, 2.4132931361645452E-16}; - constexpr FLT c16[] = {-1.6052119916687038E-17, 1.0220930228231101E-16, -4.3668420339021406E-16, 1.2658361982998821E-15, -2.5907177687935505E-15, 3.7311262928168221E-15, -3.4997038937045781E-15, 1.4124231584693148E-15, 1.3706178218468559E-15, -3.5056760846448971E-15, 3.7363519598930578E-15, -2.5923974474980012E-15, 1.2658945204780770E-15, -4.3668985335150679E-16, 1.0220927950027870E-16, -1.6052119872193216E-17}; - constexpr FLT c17[] = {1.2307507877258324E-18, -2.6518352923945508E-18, -1.0105982127470271E-20, 2.6958700270869167E-17, -1.1513299715471039E-16, 2.7882272296911513E-16, -4.6961519239790030E-16, 6.5796739812484873E-16, -6.7025909677113713E-16, 4.6238478142949540E-16, -2.8307058941305305E-16, 1.1494093936336214E-16, -2.6999653770494898E-17, 1.1474040843416029E-20, 2.6518435669432360E-18, -1.2307508200482882E-18}; + } else if constexpr (w==16) { + constexpr FLT c0[] = {3.7973138383476054E-11, 2.1620729770457883E-07, 4.2059935922517646E-05, 1.7055631615451748E-03, 2.4507833223051369E-02, 1.5833750021928350E-01, 5.2065761855025561E-01, 9.3058177132107789E-01, 9.3058177132107767E-01, 5.2065761855025572E-01, 1.5833750021928350E-01, 2.4507833223051376E-02, 1.7055631615451737E-03, 4.2059935922517646E-05, 2.1620729770457798E-07, 3.7973138383474981E-11}; + constexpr FLT c1[] = {2.3529614069937337E-10, 6.9307767643753052E-07, 9.1584555859393192E-05, 2.6688190455647245E-03, 2.7424935799146767E-02, 1.1980519064171591E-01, 2.2858769149343958E-01, 1.3403316930972956E-01, -1.3403316930972969E-01, -2.2858769149343969E-01, -1.1980519064171592E-01, -2.7424935799146788E-02, -2.6688190455647237E-03, -9.1584555859393192E-05, -6.9307767643753021E-07, -2.3529614069937312E-10}; + constexpr FLT c2[] = {6.6422278409342432E-10, 1.0324321112746623E-06, 9.1817488865684715E-05, 1.8711533829047159E-03, 1.2921996060610223E-02, 3.2047854205940286E-02, 1.0693035516337665E-02, -5.7626889750985524E-02, -5.7626889750985545E-02, 1.0693035516337648E-02, 3.2047854205940265E-02, 1.2921996060610218E-02, 1.8711533829047161E-03, 9.1817488865684715E-05, 1.0324321112746627E-06, 6.6422278409342463E-10}; + constexpr FLT c3[] = {1.1357078950958122E-09, 9.4728532805183445E-07, 5.5827161828283886E-05, 7.6087086075588375E-04, 3.0946204357507577E-03, 1.6729582927767672E-03, -9.5127691406673553E-03, -8.9630953638634506E-03, 8.9630953638635338E-03, 9.5127691406673709E-03, -1.6729582927767488E-03, -3.0946204357507586E-03, -7.6087086075588310E-04, -5.5827161828283886E-05, -9.4728532805183402E-07, -1.1357078950958117E-09}; + constexpr FLT c4[] = {1.3190161602522569E-09, 5.9764321317063357E-07, 2.2744388605472973E-05, 1.9073517322668065E-04, 2.8943142766412486E-04, -8.8625893129446582E-04, -1.3389167739521499E-03, 1.7216657535079746E-03, 1.7216657535078564E-03, -1.3389167739520829E-03, -8.8625893129448414E-04, 2.8943142766412616E-04, 1.9073517322668041E-04, 2.2744388605472987E-05, 5.9764321317063336E-07, 1.3190161602522567E-09}; + constexpr FLT c5[] = {1.1057322032863292E-09, 2.7364351668058849E-07, 6.4277990516969554E-06, 2.7144256967439121E-05, -3.6927862875725191E-05, -1.6756539822667264E-04, 1.6190404775905833E-04, 2.9203183363564066E-04, -2.9203183363584427E-04, -1.6190404775914360E-04, 1.6756539822664743E-04, 3.6927862875715480E-05, -2.7144256967439711E-05, -6.4277990516969706E-06, -2.7364351668058875E-07, -1.1057322032863290E-09}; + constexpr FLT c6[] = {6.9354916180818924E-10, 9.3269475195063537E-08, 1.2384428187212263E-06, 8.4996778392738158E-07, -1.3106613626292629E-05, 2.8218026703861212E-06, 4.1119875273697444E-05, -3.3017437945430103E-05, -3.3017437945470896E-05, 4.1119875273652538E-05, 2.8218026703968379E-06, -1.3106613626296861E-05, 8.4996778392750302E-07, 1.2384428187212206E-06, 9.3269475195063789E-08, 6.9354916180818945E-10}; + constexpr FLT c7[] = {3.3254260763956011E-10, 2.3748169129617005E-08, 1.4324995919585297E-07, -4.5855119979452145E-07, -9.5896649524105156E-07, 3.6155491754854889E-06, -9.8206137491907898E-07, -6.1812989820315811E-06, 6.1812989820368785E-06, 9.8206137490212540E-07, -3.6155491754921813E-06, 9.5896649524339805E-07, 4.5855119979457609E-07, -1.4324995919584945E-07, -2.3748169129616866E-08, -3.3254260763956011E-10}; + constexpr FLT c8[] = {1.2320735888479526E-10, 4.4066719437556721E-09, 2.9936173156487350E-09, -8.7082338359330349E-08, 1.2972939456230955E-07, 2.2882425902226497E-07, -7.3491924911512702E-07, 4.5592445665545743E-07, 4.5592445660747756E-07, -7.3491924910696321E-07, 2.2882425899779496E-07, 1.2972939455792574E-07, -8.7082338359903181E-08, 2.9936173156278789E-09, 4.4066719437554389E-09, 1.2320735888479485E-10}; + constexpr FLT c9[] = {3.5284250010876550E-11, 5.4380355945604351E-10, -2.1550460241930885E-09, -3.7344953352820494E-09, 2.7722604309424572E-08, -3.9597167050294061E-08, -1.3993916672829409E-08, 9.5626629212869654E-08, -9.5626629290371686E-08, 1.3993916672829409E-08, 3.9597167029534587E-08, -2.7722604309424572E-08, 3.7344953347198139E-09, 2.1550460241883579E-09, -5.4380355945631803E-10, -3.5284250010876550E-11}; + constexpr FLT c10[] = {7.7013760205811189E-12, 2.8123297626102908E-11, -3.7953802134156255E-10, 8.7573780433573970E-10, 5.1359846138839095E-10, -5.3609157638048723E-09, 9.1303304391922814E-09, -4.8150451469735769E-09, -4.8150451821695577E-09, 9.1303303785420424E-09, -5.3609157562628785E-09, 5.1359845683177007E-10, 8.7573780433573908E-10, -3.7953802135138315E-10, 2.8123297626198877E-11, 7.7013760205813484E-12}; + constexpr FLT c11[] = {1.2276300481457118E-12, -4.1769601372154706E-12, -1.9148402788514276E-11, 1.3822953664496548E-10, -3.0994363825898090E-10, 2.0316700162393117E-10, 4.3650570799977785E-10, -1.1534087411135158E-09, 1.1534087067582605E-09, -4.3650572872648811E-10, -2.0316702142789328E-10, 3.0994363982058370E-10, -1.3822953669820091E-10, 1.9148402794059297E-11, 4.1769601372328333E-12, -1.2276300481457027E-12}; + constexpr FLT c12[] = {1.2527329159238077E-13, -1.0816725478673842E-12, 2.7445378747019394E-12, 1.7839891961657867E-12, -2.6194652895800313E-11, 6.7446677647394068E-11, -8.5082148942419491E-11, 4.0255039228341312E-11, 4.0255014727829503E-11, -8.5082153025857524E-11, 6.7446659271937760E-11, -2.6194659020925783E-11, 1.7839886857347582E-12, 2.7445378507741848E-12, -1.0816725480072234E-12, 1.2527329159219117E-13}; + constexpr FLT c13[] = {3.2506946752584284E-15, -9.2845382012734022E-14, 5.1542690289651931E-13, -1.3678933550774322E-12, 1.6503394000353190E-12, 7.2548222075468971E-13, -6.2314783158857931E-12, 1.1299385029049823E-11, -1.1299412389378464E-11, 6.2314862503466412E-12, -7.2548671567597581E-13, -1.6503386065891033E-12, 1.3678930680384301E-12, -5.1542690417907497E-13, 9.2845381914947089E-14, -3.2506946753557064E-15}; + constexpr FLT c14[] = {-1.3523251102660285E-15, 1.9055798040664957E-15, 1.8430801809977217E-14, -1.1526991298447018E-13, 3.3348650134861527E-13, -5.8352626545764598E-13, 6.1749039614345018E-13, -2.7107869685672170E-13, -2.7109332004657512E-13, 6.1745916976419674E-13, -5.8352477326021082E-13, 3.3348496582397061E-13, -1.1526993239191820E-13, 1.8430798249970336E-14, 1.9055798150784046E-15, -1.3523251102071566E-15}; + constexpr FLT c15[] = {-2.4132931367237367E-16, 1.2442654957274558E-15, -3.5592524044067016E-15, 5.0959422961477296E-15, 1.6457864483481524E-15, -2.5291879718574400E-14, 6.2721101216839530E-14, -9.2634489356235131E-14, 9.2635738460845619E-14, -6.2719649383821551E-14, 2.5285060095240627E-14, -1.6450340241546609E-15, -5.0959072562256486E-15, 3.5592539403047753E-15, -1.2442654850132002E-15, 2.4132931369394181E-16}; + constexpr FLT c16[] = {-1.6052119792140307E-17, 1.0220935606483716E-16, -4.3668256960360204E-16, 1.2660776751946750E-15, -2.5887248648677308E-15, 3.7440553205498428E-15, -3.5013660863304473E-15, 1.4138716423669956E-15, 1.4075851421411128E-15, -3.5039069022876658E-15, 3.7371008060361623E-15, -2.5918828508224464E-15, 1.2658767163553955E-15, -4.3669235527279705E-16, 1.0220930333729136E-16, -1.6052119836268989E-17}; + constexpr FLT c17[] = {1.2307507765549609E-18, -2.6518644362922344E-18, -1.2515545353008470E-20, 2.6927650309286786E-17, -1.1524664511280185E-16, 2.7886896016078674E-16, -4.6727358397732244E-16, 6.5982664751885646E-16, -6.6441094913127157E-16, 4.6905833790076952E-16, -2.7865574777849670E-16, 1.1538444947333928E-16, -2.6987584488723839E-17, 1.2060311781417857E-20, 2.6518475285441206E-18, -1.2307508043854425E-18}; for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); } else printf("width not implemented!\n"); diff --git a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc index e2fa229b7..1731dadbe 100644 --- a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc @@ -1,171 +1,171 @@ // Code generated by gen_all_horner_C_code.m in finufft/devel // Authors: Alex Barnett & Ludvig af Klinteberg. // (C) The Simons Foundation, Inc. - if (w==2) { - constexpr FLT c0[] = {6.1209111871385702E-01, 6.1209111871385702E-01}; - constexpr FLT c1[] = {6.4742429432896431E-01, -6.4742429432896442E-01}; - constexpr FLT c2[] = {-9.0411309581634847E-02, -9.0411309581634750E-02}; - constexpr FLT c3[] = {-1.9075708590566751E-01, 1.9075708590566753E-01}; + if constexpr (w==2) { + constexpr FLT c0[] = {6.1209111871385669E-01, 6.1209111871385680E-01}; + constexpr FLT c1[] = {6.4742429432896387E-01, -6.4742429432896387E-01}; + constexpr FLT c2[] = {-9.0411309581634763E-02, -9.0411309581634819E-02}; + constexpr FLT c3[] = {-1.9075708590566753E-01, 1.9075708590566756E-01}; for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); - } else if (w==3) { - constexpr FLT c0[] = {2.4728112933307078E-01, 1.0000000000000044E+00, 2.4728112935494964E-01}; - constexpr FLT c1[] = {4.0470611346184543E-01, 2.1212921335912390E-17, -4.0470611343822160E-01}; - constexpr FLT c2[] = {1.4864411342268655E-01, -3.0473448739822773E-01, 1.4864411344492173E-01}; - constexpr FLT c3[] = {-4.4469294619149627E-02, 1.3598904496642886E-16, 4.4469294640111616E-02}; - constexpr FLT c4[] = {-2.9270010751775037E-02, 3.7966707032750659E-02, -2.9270010728701147E-02}; + } else if constexpr (w==3) { + constexpr FLT c0[] = {2.4728112933307073E-01, 1.0000000000000040E+00, 2.4728112935494936E-01}; + constexpr FLT c1[] = {4.0470611346184499E-01, -1.6970337068730035E-16, -4.0470611343822127E-01}; + constexpr FLT c2[] = {1.4864411342268652E-01, -3.0473448739822778E-01, 1.4864411344492170E-01}; + constexpr FLT c3[] = {-4.4469294619149682E-02, 4.8015385165414327E-17, 4.4469294640111567E-02}; + constexpr FLT c4[] = {-2.9270010751775020E-02, 3.7966707032750471E-02, -2.9270010728701217E-02}; for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); - } else if (w==4) { - constexpr FLT c0[] = {8.4048892491849839E-02, 7.9275732207620875E-01, 7.9275732207620908E-01, 8.4048892491849811E-02}; - constexpr FLT c1[] = {1.7431588385887239E-01, 3.7425489538028417E-01, -3.7425489538028422E-01, -1.7431588385887242E-01}; - constexpr FLT c2[] = {1.1425598262146337E-01, -1.1126112046907141E-01, -1.1126112046907137E-01, 1.1425598262146335E-01}; - constexpr FLT c3[] = {1.5677587697716072E-02, -6.7022293289915616E-02, 6.7022293289915727E-02, -1.5677587697716041E-02}; - constexpr FLT c4[] = {-1.0401300825285629E-02, 6.3725646657139309E-03, 6.3725646657139005E-03, -1.0401300825285625E-02}; - constexpr FLT c5[] = {-3.0464394190490617E-03, 5.3247889205097435E-03, -5.3247889205097279E-03, 3.0464394190490305E-03}; + } else if constexpr (w==4) { + constexpr FLT c0[] = {8.4048892491849742E-02, 7.9275732207620841E-01, 7.9275732207620875E-01, 8.4048892491849783E-02}; + constexpr FLT c1[] = {1.7431588385887223E-01, 3.7425489538028367E-01, -3.7425489538028395E-01, -1.7431588385887223E-01}; + constexpr FLT c2[] = {1.1425598262146335E-01, -1.1126112046907141E-01, -1.1126112046907129E-01, 1.1425598262146333E-01}; + constexpr FLT c3[] = {1.5677587697716069E-02, -6.7022293289915769E-02, 6.7022293289915741E-02, -1.5677587697716048E-02}; + constexpr FLT c4[] = {-1.0401300825285637E-02, 6.3725646657137903E-03, 6.3725646657138996E-03, -1.0401300825285627E-02}; + constexpr FLT c5[] = {-3.0464394190490968E-03, 5.3247889205096637E-03, -5.3247889205097453E-03, 3.0464394190490743E-03}; for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); - } else if (w==5) { - constexpr FLT c0[] = {2.5811126752233307E-02, 4.6616226852477344E-01, 1.0000000000000007E+00, 4.6616226852477305E-01, 2.5811126752233318E-02}; - constexpr FLT c1[] = {6.2936773057387055E-02, 3.7198919402374020E-01, 2.1212921335912559E-17, -3.7198919402374009E-01, -6.2936773057387055E-02}; - constexpr FLT c2[] = {5.4855980576944567E-02, 3.7709308632020676E-02, -1.8284069243892637E-01, 3.7709308632020731E-02, 5.4855980576944567E-02}; - constexpr FLT c3[] = {1.8780973157032140E-02, -3.8322611720715660E-02, 1.4047484462204681E-16, 3.8322611720715834E-02, -1.8780973157032116E-02}; - constexpr FLT c4[] = {-2.3306908700105430E-05, -8.3858973028989436E-03, 1.4886952481383787E-02, -8.3858973028988499E-03, -2.3306908700106227E-05}; - constexpr FLT c5[] = {-1.5212353034889806E-03, 1.7151925122365422E-03, 1.0734071182258885E-16, -1.7151925122365888E-03, 1.5212353034889806E-03}; + } else if constexpr (w==5) { + constexpr FLT c0[] = {2.5811126752233300E-02, 4.6616226852477310E-01, 1.0000000000000004E+00, 4.6616226852477283E-01, 2.5811126752233279E-02}; + constexpr FLT c1[] = {6.2936773057387013E-02, 3.7198919402373987E-01, -1.2727752801547548E-16, -3.7198919402373976E-01, -6.2936773057386999E-02}; + constexpr FLT c2[] = {5.4855980576944546E-02, 3.7709308632020579E-02, -1.8284069243892631E-01, 3.7709308632020731E-02, 5.4855980576944560E-02}; + constexpr FLT c3[] = {1.8780973157032137E-02, -3.8322611720715702E-02, -3.0072186542941280E-17, 3.8322611720715730E-02, -1.8780973157032126E-02}; + constexpr FLT c4[] = {-2.3306908700113508E-05, -8.3858973028989696E-03, 1.4886952481383579E-02, -8.3858973028989054E-03, -2.3306908700115385E-05}; + constexpr FLT c5[] = {-1.5212353034890040E-03, 1.7151925122364340E-03, -1.0820256419442042E-16, -1.7151925122365530E-03, 1.5212353034889936E-03}; for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); - } else if (w==6) { - constexpr FLT c0[] = {7.3992041846532818E-03, 2.2998056434514028E-01, 8.5775196559356059E-01, 8.5775196559356115E-01, 2.2998056434514028E-01, 7.3992041847816166E-03}; - constexpr FLT c1[] = {2.0397684222696250E-02, 2.4277466601214742E-01, 2.6509440217151281E-01, -2.6509440217151231E-01, -2.4277466601214739E-01, -2.0397684222557694E-02}; - constexpr FLT c2[] = {2.1435449512033435E-02, 7.4190333865239946E-02, -9.5369600014193256E-02, -9.5369600014193381E-02, 7.4190333865239905E-02, 2.1435449512163876E-02}; - constexpr FLT c3[] = {1.0463664645794037E-02, -5.8671703446042224E-03, -3.4019677093840447E-02, 3.4019677093840760E-02, 5.8671703446042771E-03, -1.0463664645671082E-02}; - constexpr FLT c4[] = {1.9378826192716972E-03, -6.8365127179467735E-03, 4.7406536657957962E-03, 4.7406536657958473E-03, -6.8365127179467848E-03, 1.9378826194070377E-03}; - constexpr FLT c5[] = {-2.6471424081647417E-04, -5.6150758897069279E-04, 2.0099203466671291E-03, -2.0099203466670359E-03, 5.6150758897070829E-04, 2.6471424094083520E-04}; - constexpr FLT c6[] = {-1.6161497824910217E-04, 2.5924418389355766E-04, -1.3917099193215483E-04, -1.3917099193211840E-04, 2.5924418389357192E-04, -1.6161497812639921E-04}; + } else if constexpr (w==6) { + constexpr FLT c0[] = {7.3992041846532739E-03, 2.2998056434514014E-01, 8.5775196559356071E-01, 8.5775196559356048E-01, 2.2998056434514008E-01, 7.3992041847815992E-03}; + constexpr FLT c1[] = {2.0397684222696229E-02, 2.4277466601214714E-01, 2.6509440217151259E-01, -2.6509440217151231E-01, -2.4277466601214720E-01, -2.0397684222557680E-02}; + constexpr FLT c2[] = {2.1435449512033435E-02, 7.4190333865239919E-02, -9.5369600014193437E-02, -9.5369600014193354E-02, 7.4190333865239877E-02, 2.1435449512163880E-02}; + constexpr FLT c3[] = {1.0463664645794034E-02, -5.8671703446042016E-03, -3.4019677093840517E-02, 3.4019677093840572E-02, 5.8671703446042381E-03, -1.0463664645671085E-02}; + constexpr FLT c4[] = {1.9378826192716935E-03, -6.8365127179467822E-03, 4.7406536657958352E-03, 4.7406536657957372E-03, -6.8365127179468334E-03, 1.9378826194070343E-03}; + constexpr FLT c5[] = {-2.6471424081648518E-04, -5.6150758897076337E-04, 2.0099203466670337E-03, -2.0099203466670832E-03, 5.6150758897070309E-04, 2.6471424094084089E-04}; + constexpr FLT c6[] = {-1.6161497824910900E-04, 2.5924418389357582E-04, -1.3917099193220238E-04, -1.3917099193215624E-04, 2.5924418389355257E-04, -1.6161497812640228E-04}; for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); - } else if (w==7) { - constexpr FLT c0[] = {2.0163149398992283E-03, 1.0071602557045130E-01, 5.8653557849806126E-01, 1.0000000000000002E+00, 5.8653557849806159E-01, 1.0071602557045131E-01, 2.0163149399332597E-03}; - constexpr FLT c1[] = {6.1353661835569211E-03, 1.2822551681002711E-01, 3.1973557271594344E-01, -2.1212921335912596E-17, -3.1973557271594366E-01, -1.2822551681002711E-01, -6.1353661835202118E-03}; - constexpr FLT c2[] = {7.4065234100227761E-03, 5.7825030729344404E-02, 1.0889852837592919E-04, -1.3060049459923276E-01, 1.0889852837575314E-04, 5.7825030729344355E-02, 7.4065234100573725E-03}; - constexpr FLT c3[] = {4.4924606632387705E-03, 7.2245566707421303E-03, -2.7743312484355583E-02, 1.0559644416237177E-16, 2.7743312484355832E-02, -7.2245566707420826E-03, -4.4924606632061881E-03}; - constexpr FLT c4[] = {1.3572774007773842E-03, -2.3954706749181320E-03, -2.9058644824981098E-03, 7.8619155407045772E-03, -2.9058644824980807E-03, -2.3954706749181507E-03, 1.3572774008132615E-03}; - constexpr FLT c5[] = {1.1260116639581618E-04, -7.8814564904709067E-04, 1.1036556706849172E-03, -3.0492924261508591E-17, -1.1036556706849482E-03, 7.8814564904710227E-04, -1.1260116636284763E-04}; - constexpr FLT c6[] = {-4.7399003259805808E-05, 2.0950491943152726E-06, 1.7484854214667859E-04, -2.9104069274769336E-04, 1.7484854214659272E-04, 2.0950491943114936E-06, -4.7399003227280901E-05}; - constexpr FLT c7[] = {-1.2555096177146811E-05, 2.7293834771974277E-05, -2.6660039700396876E-05, 5.1878356274645480E-17, 2.6660039700612832E-05, -2.7293834771939816E-05, 1.2555096209061404E-05}; + } else if constexpr (w==7) { + constexpr FLT c0[] = {2.0163149398992283E-03, 1.0071602557045124E-01, 5.8653557849806104E-01, 1.0000000000000000E+00, 5.8653557849806159E-01, 1.0071602557045128E-01, 2.0163149399332544E-03}; + constexpr FLT c1[] = {6.1353661835569159E-03, 1.2822551681002697E-01, 3.1973557271594322E-01, -8.4851685343650348E-17, -3.1973557271594344E-01, -1.2822551681002697E-01, -6.1353661835202083E-03}; + constexpr FLT c2[] = {7.4065234100227735E-03, 5.7825030729344376E-02, 1.0889852837580607E-04, -1.3060049459923290E-01, 1.0889852837579503E-04, 5.7825030729344355E-02, 7.4065234100573743E-03}; + constexpr FLT c3[] = {4.4924606632387740E-03, 7.2245566707420956E-03, -2.7743312484355645E-02, 4.8062139167337038E-18, 2.7743312484355763E-02, -7.2245566707421025E-03, -4.4924606632061890E-03}; + constexpr FLT c4[] = {1.3572774007773838E-03, -2.3954706749181576E-03, -2.9058644824981614E-03, 7.8619155407044367E-03, -2.9058644824981484E-03, -2.3954706749181576E-03, 1.3572774008132609E-03}; + constexpr FLT c5[] = {1.1260116639581245E-04, -7.8814564904714066E-04, 1.1036556706848430E-03, -1.0256120645547333E-16, -1.1036556706849785E-03, 7.8814564904712190E-04, -1.1260116636284711E-04}; + constexpr FLT c6[] = {-4.7399003259807427E-05, 2.0950491942974621E-06, 1.7484854214657133E-04, -2.9104069274777630E-04, 1.7484854214657890E-04, 2.0950491942817212E-06, -4.7399003227280379E-05}; + constexpr FLT c7[] = {-1.2555096177145360E-05, 2.7293834771960423E-05, -2.6660039700455961E-05, -5.0941873861390392E-17, 2.6660039700535064E-05, -2.7293834771960725E-05, 1.2555096209060891E-05}; for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); - } else if (w==8) { - constexpr FLT c0[] = {5.2827275612461462E-04, 4.0402734444109238E-02, 3.4389230803369686E-01, 8.9161099745784866E-01, 8.9161099745784866E-01, 3.4389230803369708E-01, 4.0402734444109252E-02, 5.2827275612461408E-04}; - constexpr FLT c1[] = {1.7458301875074096E-03, 5.9145446836664541E-02, 2.5435204236257858E-01, 2.0538938722823222E-01, -2.0538938722823233E-01, -2.5435204236257858E-01, -5.9145446836664547E-02, -1.7458301875074094E-03}; - constexpr FLT c2[] = {2.3525728171808306E-03, 3.3585505340219701E-02, 4.4733940386002209E-02, -8.0668262921248624E-02, -8.0668262921248748E-02, 4.4733940386002119E-02, 3.3585505340219687E-02, 2.3525728171808311E-03}; - constexpr FLT c3[] = {1.6676293877589678E-03, 8.1606118103203940E-03, -1.0603838868224419E-02, -2.0559571166483725E-02, 2.0559571166484002E-02, 1.0603838868224510E-02, -8.1606118103203749E-03, -1.6676293877589678E-03}; - constexpr FLT c4[] = {6.5470478006265378E-04, 5.7029826102775656E-05, -4.0842122325118182E-03, 3.3746160664395084E-03, 3.3746160664396086E-03, -4.0842122325118321E-03, 5.7029826102778678E-05, 6.5470478006265432E-04}; - constexpr FLT c5[] = {1.2504911757628686E-04, -3.9351755557266000E-04, 2.3739384784447216E-05, 9.6592347103022203E-04, -9.6592347103013649E-04, -2.3739384784439440E-05, 3.9351755557266586E-04, -1.2504911757628702E-04}; - constexpr FLT c6[] = {-6.5665874015798238E-07, -6.1884865695206891E-05, 1.4476791315356577E-04, -8.6782118193344350E-05, -8.6782118193318939E-05, 1.4476791315358196E-04, -6.1884865695214169E-05, -6.5665874015806602E-07}; - constexpr FLT c7[] = {-5.1256159860509675E-06, 5.3292178505898186E-06, 8.7427989025457230E-06, -2.8404799465047339E-05, 2.8404799465135336E-05, -8.7427989024875505E-06, -5.3292178505782125E-06, 5.1256159860509675E-06}; + } else if constexpr (w==8) { + constexpr FLT c0[] = {5.2827275612461451E-04, 4.0402734444109217E-02, 3.4389230803369680E-01, 8.9161099745784878E-01, 8.9161099745784844E-01, 3.4389230803369675E-01, 4.0402734444109217E-02, 5.2827275612461397E-04}; + constexpr FLT c1[] = {1.7458301875074075E-03, 5.9145446836664499E-02, 2.5435204236257836E-01, 2.0538938722823202E-01, -2.0538938722823219E-01, -2.5435204236257830E-01, -5.9145446836664492E-02, -1.7458301875074073E-03}; + constexpr FLT c2[] = {2.3525728171808302E-03, 3.3585505340219701E-02, 4.4733940386002168E-02, -8.0668262921248846E-02, -8.0668262921248762E-02, 4.4733940386002147E-02, 3.3585505340219694E-02, 2.3525728171808306E-03}; + constexpr FLT c3[] = {1.6676293877589689E-03, 8.1606118103203923E-03, -1.0603838868224458E-02, -2.0559571166483832E-02, 2.0559571166483812E-02, 1.0603838868224439E-02, -8.1606118103203784E-03, -1.6676293877589683E-03}; + constexpr FLT c4[] = {6.5470478006265378E-04, 5.7029826102773115E-05, -4.0842122325118468E-03, 3.3746160664394906E-03, 3.3746160664395036E-03, -4.0842122325118451E-03, 5.7029826102771821E-05, 6.5470478006265345E-04}; + constexpr FLT c5[] = {1.2504911757628621E-04, -3.9351755557268033E-04, 2.3739384784364383E-05, 9.6592347103011448E-04, -9.6592347103028199E-04, -2.3739384784413348E-05, 3.9351755557267193E-04, -1.2504911757628642E-04}; + constexpr FLT c6[] = {-6.5665874015840706E-07, -6.1884865695206498E-05, 1.4476791315354124E-04, -8.6782118193463355E-05, -8.6782118193402314E-05, 1.4476791315355117E-04, -6.1884865695212461E-05, -6.5665874015819805E-07}; + constexpr FLT c7[] = {-5.1256159860503314E-06, 5.3292178505928552E-06, 8.7427989025428753E-06, -2.8404799465120154E-05, 2.8404799465045320E-05, -8.7427989025463803E-06, -5.3292178505922614E-06, 5.1256159860508015E-06}; for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); - } else if (w==9) { - constexpr FLT c0[] = {1.3409415535124456E-04, 1.5141199617983757E-02, 1.8004032483820079E-01, 6.6268423293859657E-01, 1.0000000000000004E+00, 6.6268423293859746E-01, 1.8004032483820084E-01, 1.5141199617983828E-02, 1.3409415535124450E-04}; - constexpr FLT c1[] = {4.7572953640583401E-04, 2.4761567630011042E-02, 1.6332247709293549E-01, 2.7616213278983226E-01, -4.2425842671825223E-17, -2.7616213278983237E-01, -1.6332247709293549E-01, -2.4761567630011111E-02, -4.7572953640583401E-04}; - constexpr FLT c2[] = {7.0217948741779855E-04, 1.6533012331430421E-02, 4.8637875368588490E-02, -1.5084170630533007E-02, -1.0157816246606997E-01, -1.5084170630533338E-02, 4.8637875368588449E-02, 1.6533012331430445E-02, 7.0217948741779833E-04}; - constexpr FLT c3[] = {5.6197289626769645E-04, 5.4583505067803007E-03, 8.8722695781044485E-04, -2.0386313118366230E-02, 1.4346537772579219E-16, 2.0386313118366597E-02, -8.8722695781040203E-04, -5.4583505067802999E-03, -5.6197289626769645E-04}; - constexpr FLT c4[] = {2.6358216867957524E-04, 7.0803132065997147E-04, -2.3883045659485441E-03, -1.0047843626593360E-03, 4.8455486978739078E-03, -1.0047843626590051E-03, -2.3883045659485362E-03, 7.0803132065996898E-04, 2.6358216867957530E-04}; - constexpr FLT c5[] = {7.0565721004957831E-05, -9.0876125855045856E-05, -3.5965836571493702E-04, 7.0575785995728897E-04, 5.6006957738110937E-17, -7.0575785995746006E-04, 3.5965836571493702E-04, 9.0876125855046818E-05, -7.0565721004957980E-05}; - constexpr FLT c6[] = {7.9668965137354764E-06, -4.2137454928171943E-05, 3.9856859670063718E-05, 6.5639620808911507E-05, -1.4477186949841611E-04, 6.5639620808762402E-05, 3.9856859670072629E-05, -4.2137454928186349E-05, 7.9668965137352681E-06}; - constexpr FLT c7[] = {-9.3772917893888351E-07, -3.0575635011675480E-06, 1.2977675432514170E-05, -1.5241881422267232E-05, 5.6444540850624641E-17, 1.5241881422464882E-05, -1.2977675432482811E-05, 3.0575635011824812E-06, 9.3772917893893782E-07}; - constexpr FLT c8[] = {-4.1446092652958961E-07, 7.2790527337844100E-07, -2.5130319764268858E-08, -1.9002349621010172E-06, 3.0493470976000790E-06, -1.9002349619116138E-06, -2.5130319761051126E-08, 7.2790527337217009E-07, -4.1446092652952507E-07}; + } else if constexpr (w==9) { + constexpr FLT c0[] = {1.3409415535124453E-04, 1.5141199617983750E-02, 1.8004032483820073E-01, 6.6268423293859624E-01, 1.0000000000000000E+00, 6.6268423293859713E-01, 1.8004032483820073E-01, 1.5141199617983806E-02, 1.3409415535124423E-04}; + constexpr FLT c1[] = {4.7572953640583380E-04, 2.4761567630011024E-02, 1.6332247709293538E-01, 2.7616213278983187E-01, -1.2727752801547551E-16, -2.7616213278983220E-01, -1.6332247709293540E-01, -2.4761567630011090E-02, -4.7572953640583358E-04}; + constexpr FLT c2[] = {7.0217948741779822E-04, 1.6533012331430414E-02, 4.8637875368588435E-02, -1.5084170630533118E-02, -1.0157816246607025E-01, -1.5084170630533399E-02, 4.8637875368588449E-02, 1.6533012331430449E-02, 7.0217948741779855E-04}; + constexpr FLT c3[] = {5.6197289626769645E-04, 5.4583505067802972E-03, 8.8722695781043152E-04, -2.0386313118366337E-02, 7.7339938432418739E-17, 2.0386313118366497E-02, -8.8722695781042523E-04, -5.4583505067803042E-03, -5.6197289626769623E-04}; + constexpr FLT c4[] = {2.6358216867957519E-04, 7.0803132065996725E-04, -2.3883045659485484E-03, -1.0047843626593107E-03, 4.8455486978738038E-03, -1.0047843626591504E-03, -2.3883045659485562E-03, 7.0803132065996670E-04, 2.6358216867957530E-04}; + constexpr FLT c5[] = {7.0565721004957736E-05, -9.0876125855053581E-05, -3.5965836571498310E-04, 7.0575785995715778E-04, -1.7863253875768360E-16, -7.0575785995748987E-04, 3.5965836571493544E-04, 9.0876125855052036E-05, -7.0565721004957886E-05}; + constexpr FLT c6[] = {7.9668965137352579E-06, -4.2137454928178333E-05, 3.9856859670039154E-05, 6.5639620808859397E-05, -1.4477186949857812E-04, 6.5639620808726203E-05, 3.9856859670047136E-05, -4.2137454928184303E-05, 7.9668965137354324E-06}; + constexpr FLT c7[] = {-9.3772917893887112E-07, -3.0575635011687597E-06, 1.2977675432519954E-05, -1.5241881422255193E-05, 1.6602960103597961E-17, 1.5241881422356022E-05, -1.2977675432524160E-05, 3.0575635011788267E-06, 9.3772917893911581E-07}; + constexpr FLT c8[] = {-4.1446092652953487E-07, 7.2790527337856297E-07, -2.5130319741045627E-08, -1.9002349620285069E-06, 3.0493470975570654E-06, -1.9002349619475500E-06, -2.5130319785472300E-08, 7.2790527336840831E-07, -4.1446092652968940E-07}; for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); - } else if (w==10) { - constexpr FLT c0[] = {3.3157481538170295E-05, 5.3715860775974443E-03, 8.6328042282845782E-02, 4.3077092326437988E-01, 9.1242439930731112E-01, 9.1242439930731112E-01, 4.3077092326437971E-01, 8.6328042282845754E-02, 5.3715860775974227E-03, 3.3157481538170322E-05}; - constexpr FLT c1[] = {1.2517797191066981E-04, 9.6269418565961412E-03, 9.1130577457178452E-02, 2.4769645835465362E-01, 1.6766875916810517E-01, -1.6766875916810536E-01, -2.4769645835465354E-01, -9.1130577457178424E-02, -9.6269418565961117E-03, -1.2517797191066951E-04}; - constexpr FLT c2[] = {1.9968216068682153E-04, 7.2783782301876591E-03, 3.5949398124193940E-02, 2.5847993600195553E-02, -6.9275634160640490E-02, -6.9275634160640504E-02, 2.5847993600195445E-02, 3.5949398124193913E-02, 7.2783782301876375E-03, 1.9968216068682094E-04}; - constexpr FLT c3[] = {1.7649923565147242E-04, 2.9221990881931090E-03, 4.9086823797165058E-03, -1.0940556313145914E-02, -1.3762152424114656E-02, 1.3762152424114910E-02, 1.0940556313146081E-02, -4.9086823797164919E-03, -2.9221990881930998E-03, -1.7649923565147204E-04}; - constexpr FLT c4[] = {9.4710355505531920E-05, 6.0621452710061727E-04, -7.0118560592788729E-04, -2.4750745659639179E-03, 2.4757076628501668E-03, 2.4757076628502063E-03, -2.4750745659640264E-03, -7.0118560592788274E-04, 6.0621452710061163E-04, 9.4710355505531771E-05}; - constexpr FLT c5[] = {3.1258610702677804E-05, 2.8169545035126350E-05, -2.9881406711974808E-04, 1.5956798534243302E-04, 5.3653099874326161E-04, -5.3653099874339388E-04, -1.5956798534226972E-04, 2.9881406711975192E-04, -2.8169545035121488E-05, -3.1258610702677743E-05}; - constexpr FLT c6[] = {5.7780052154065432E-06, -1.5636835808661990E-05, -1.6121807313036067E-05, 8.1230533420465018E-05, -5.5456530742754838E-05, -5.5456530742851827E-05, 8.1230533420445272E-05, -1.6121807313045130E-05, -1.5636835808665131E-05, 5.7780052154064593E-06}; - constexpr FLT c7[] = {2.7742147829406768E-07, -3.2550081973304980E-06, 5.9212960378031332E-06, 8.5495977199682674E-07, -1.3248468528032551E-05, 1.3248468528215217E-05, -8.5495977185729702E-07, -5.9212960377964950E-06, 3.2550081973313239E-06, -2.7742147829400097E-07}; - constexpr FLT c8[] = {-1.2089379439825852E-07, -3.4743143855784781E-08, 8.2889801006379481E-07, -1.5830293785226849E-06, 8.7461219388985494E-07, 8.7461219397529632E-07, -1.5830293786451511E-06, 8.2889801008534534E-07, -3.4743143855462353E-08, -1.2089379439833804E-07}; - constexpr FLT c9[] = {-2.5033479260872450E-08, 6.3042298326687954E-08, -5.2303271559903752E-08, -7.6226091757998386E-08, 2.3316553102767969E-07, -2.3316553111902137E-07, 7.6226091879787297E-08, 5.2303271554367896E-08, -6.3042298324957995E-08, 2.5033479260965031E-08}; + } else if constexpr (w==10) { + constexpr FLT c0[] = {3.3157481538170376E-05, 5.3715860775974400E-03, 8.6328042282845727E-02, 4.3077092326437977E-01, 9.1242439930731056E-01, 9.1242439930731056E-01, 4.3077092326437955E-01, 8.6328042282845727E-02, 5.3715860775974157E-03, 3.3157481538170227E-05}; + constexpr FLT c1[] = {1.2517797191066970E-04, 9.6269418565961325E-03, 9.1130577457178341E-02, 2.4769645835465337E-01, 1.6766875916810506E-01, -1.6766875916810528E-01, -2.4769645835465337E-01, -9.1130577457178355E-02, -9.6269418565961048E-03, -1.2517797191066943E-04}; + constexpr FLT c2[] = {1.9968216068682134E-04, 7.2783782301876591E-03, 3.5949398124193926E-02, 2.5847993600195424E-02, -6.9275634160640739E-02, -6.9275634160640559E-02, 2.5847993600195397E-02, 3.5949398124193920E-02, 7.2783782301876392E-03, 1.9968216068682104E-04}; + constexpr FLT c3[] = {1.7649923565147245E-04, 2.9221990881931085E-03, 4.9086823797165040E-03, -1.0940556313145984E-02, -1.3762152424114743E-02, 1.3762152424114974E-02, 1.0940556313146011E-02, -4.9086823797165093E-03, -2.9221990881930998E-03, -1.7649923565147212E-04}; + constexpr FLT c4[] = {9.4710355505531893E-05, 6.0621452710061488E-04, -7.0118560592791126E-04, -2.4750745659639292E-03, 2.4757076628500714E-03, 2.4757076628500606E-03, -2.4750745659640741E-03, -7.0118560592790562E-04, 6.0621452710060925E-04, 9.4710355505531690E-05}; + constexpr FLT c5[] = {3.1258610702677723E-05, 2.8169545035122152E-05, -2.9881406711978152E-04, 1.5956798534229568E-04, 5.3653099874320198E-04, -5.3653099874341719E-04, -1.5956798534230856E-04, 2.9881406711976336E-04, -2.8169545035121613E-05, -3.1258610702677696E-05}; + constexpr FLT c6[] = {5.7780052154064796E-06, -1.5636835808663068E-05, -1.6121807313045479E-05, 8.1230533420404926E-05, -5.5456530742904635E-05, -5.5456530742895771E-05, 8.1230533420387498E-05, -1.6121807313057897E-05, -1.5636835808665389E-05, 5.7780052154065305E-06}; + constexpr FLT c7[] = {2.7742147829407355E-07, -3.2550081973293079E-06, 5.9212960378138431E-06, 8.5495977198199636E-07, -1.3248468528094739E-05, 1.3248468528222524E-05, -8.5495977193841556E-07, -5.9212960378138668E-06, 3.2550081973309296E-06, -2.7742147829403215E-07}; + constexpr FLT c8[] = {-1.2089379439828256E-07, -3.4743143855636537E-08, 8.2889801007672424E-07, -1.5830293784962681E-06, 8.7461219394036447E-07, 8.7461219389667727E-07, -1.5830293787035047E-06, 8.2889801007086076E-07, -3.4743143856988507E-08, -1.2089379439837777E-07}; + constexpr FLT c9[] = {-2.5033479260940706E-08, 6.3042298324612022E-08, -5.2303271564055656E-08, -7.6226091805053197E-08, 2.3316553100830421E-07, -2.3316553113839693E-07, 7.6226091857643871E-08, 5.2303271557135834E-08, -6.3042298326514974E-08, 2.5033479260977198E-08}; for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==11) { - constexpr FLT c0[] = {8.0191950887587638E-06, 1.8211144887695905E-03, 3.8565497751765702E-02, 2.5236459439543663E-01, 7.1517256669690443E-01, 1.0000000000000002E+00, 7.1517256669690443E-01, 2.5236459439543651E-01, 3.8565497751765723E-02, 1.8211144887695927E-03, 8.0191950887586707E-06}; - constexpr FLT c1[] = {3.1996260415636073E-05, 3.5282769389657661E-03, 4.5889527487056492E-02, 1.8012194355267480E-01, 2.4178022040260394E-01, 2.1212921335912587E-17, -2.4178022040260411E-01, -1.8012194355267488E-01, -4.5889527487056492E-02, -3.5282769389657648E-03, -3.1996260415635850E-05}; - constexpr FLT c2[] = {5.4612928019025183E-05, 2.9497743530118290E-03, 2.1858479505161201E-02, 3.8333708936616528E-02, -2.1641923687039297E-02, -8.3109405654057292E-02, -2.1641923687039287E-02, 3.8333708936616487E-02, 2.1858479505161187E-02, 2.9497743530118290E-03, 5.4612928019024885E-05}; - constexpr FLT c3[] = {5.2504054888010150E-05, 1.3660648269306127E-03, 4.7357572177382694E-03, -2.2373255422688926E-03, -1.5459233729560824E-02, -3.0584997651941540E-18, 1.5459233729561050E-02, 2.2373255422689746E-03, -4.7357572177382599E-03, -1.3660648269306129E-03, -5.2504054888009953E-05}; - constexpr FLT c4[] = {3.1396100602888584E-05, 3.6443237253636144E-04, 1.5906780001786821E-04, -1.9495384184342716E-03, -2.4621376046556434E-04, 3.2818730060399505E-03, -2.4621376046541547E-04, -1.9495384184342974E-03, 1.5906780001787157E-04, 3.6443237253636144E-04, 3.1396100602888483E-05}; - constexpr FLT c5[] = {1.2057435171015750E-05, 4.6687328398363315E-05, -1.3963494372747466E-04, -1.4877651674418741E-04, 4.6954815721697059E-04, 7.1576260535837041E-17, -4.6954815721696283E-04, 1.4877651674414852E-04, 1.3963494372747659E-04, -4.6687328398363071E-05, -1.2057435171015728E-05}; - constexpr FLT c6[] = {2.8888404081262488E-06, -1.8976367884800935E-06, -2.4767547607257735E-05, 3.8337725458133611E-05, 2.6462355617055980E-05, -8.2113719362939881E-05, 2.6462355617066876E-05, 3.8337725458138978E-05, -2.4767547607262269E-05, -1.8976367884805327E-06, 2.8888404081262340E-06}; - constexpr FLT c7[] = {3.5729663467786725E-07, -1.6085054296206689E-06, 4.5672370507959851E-07, 6.0608527683273524E-06, -9.0233724844644286E-06, -4.5070818825954386E-17, 9.0233724845159214E-06, -6.0608527682667218E-06, -4.5672370507254818E-07, 1.6085054296207723E-06, -3.5729663467788907E-07}; - constexpr FLT c8[] = {-7.7890073973236871E-09, -1.8340559948709468E-07, 5.4451797328971916E-07, -3.5830285713854766E-07, -7.3873233537913819E-07, 1.4648976903075259E-06, -7.3873233536710514E-07, -3.5830285713236262E-07, 5.4451797329704790E-07, -1.8340559948689703E-07, -7.7890073973081013E-09}; - constexpr FLT c9[] = {-9.8984999695252047E-09, 1.0194946774280524E-08, 3.5279000677512062E-08, -1.1638771469313311E-07, 1.2326133617211816E-07, -2.5669371006274292E-17, -1.2326133615551060E-07, 1.1638771463500659E-07, -3.5279000676820083E-08, -1.0194946774410270E-08, 9.8984999695130418E-09}; + } else if constexpr (w==11) { + constexpr FLT c0[] = {8.0191950887587910E-06, 1.8211144887695892E-03, 3.8565497751765689E-02, 2.5236459439543657E-01, 7.1517256669690410E-01, 9.9999999999999967E-01, 7.1517256669690377E-01, 2.5236459439543635E-01, 3.8565497751765709E-02, 1.8211144887695895E-03, 8.0191950887586503E-06}; + constexpr FLT c1[] = {3.1996260415636067E-05, 3.5282769389657614E-03, 4.5889527487056443E-02, 1.8012194355267477E-01, 2.4178022040260364E-01, -4.2425842671825205E-17, -2.4178022040260411E-01, -1.8012194355267475E-01, -4.5889527487056443E-02, -3.5282769389657627E-03, -3.1996260415635823E-05}; + constexpr FLT c2[] = {5.4612928019025129E-05, 2.9497743530118277E-03, 2.1858479505161194E-02, 3.8333708936616494E-02, -2.1641923687039349E-02, -8.3109405654057333E-02, -2.1641923687039270E-02, 3.8333708936616494E-02, 2.1858479505161191E-02, 2.9497743530118282E-03, 5.4612928019024878E-05}; + constexpr FLT c3[] = {5.2504054888010184E-05, 1.3660648269306136E-03, 4.7357572177382659E-03, -2.2373255422689052E-03, -1.5459233729560849E-02, 3.9859385791562669E-17, 1.5459233729560927E-02, 2.2373255422689529E-03, -4.7357572177382625E-03, -1.3660648269306131E-03, -5.2504054888009940E-05}; + constexpr FLT c4[] = {3.1396100602888591E-05, 3.6443237253636095E-04, 1.5906780001786113E-04, -1.9495384184342966E-03, -2.4621376046567064E-04, 3.2818730060398637E-03, -2.4621376046565297E-04, -1.9495384184343234E-03, 1.5906780001786160E-04, 3.6443237253636095E-04, 3.1396100602888456E-05}; + constexpr FLT c5[] = {1.2057435171015747E-05, 4.6687328398361512E-05, -1.3963494372749242E-04, -1.4877651674418728E-04, 4.6954815721681679E-04, -3.4103624816061268E-17, -4.6954815721704350E-04, 1.4877651674414814E-04, 1.3963494372748253E-04, -4.6687328398363077E-05, -1.2057435171015720E-05}; + constexpr FLT c6[] = {2.8888404081262352E-06, -1.8976367884810240E-06, -2.4767547607265331E-05, 3.8337725458121597E-05, 2.6462355617062251E-05, -8.2113719362918427E-05, 2.6462355616970796E-05, 3.8337725458125717E-05, -2.4767547607267571E-05, -1.8976367884806328E-06, 2.8888404081262331E-06}; + constexpr FLT c7[] = {3.5729663467788187E-07, -1.6085054296200635E-06, 4.5672370508001001E-07, 6.0608527683362192E-06, -9.0233724844422414E-06, -6.1104635753534212E-18, 9.0233724844321685E-06, -6.0608527683014282E-06, -4.5672370507820282E-07, 1.6085054296207566E-06, -3.5729663467788679E-07}; + constexpr FLT c8[] = {-7.7890073973106060E-09, -1.8340559948668337E-07, 5.4451797329480613E-07, -3.5830285713357727E-07, -7.3873233537270032E-07, 1.4648976902855600E-06, -7.3873233541090648E-07, -3.5830285716766383E-07, 5.4451797328800011E-07, -1.8340559948752225E-07, -7.7890073973213941E-09}; + constexpr FLT c9[] = {-9.8984999695258830E-09, 1.0194946773891284E-08, 3.5279000671976206E-08, -1.1638771468206141E-07, 1.2326133615274268E-07, -5.5758054629440675E-18, -1.2326133614167095E-07, 1.1638771462947075E-07, -3.5279000679588014E-08, -1.0194946774453520E-08, 9.8984999695123669E-09}; for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==12) { - constexpr FLT c0[] = {1.9028495068410023E-06, 5.9416527261081913E-04, 1.6248140264385581E-02, 1.3597036436097915E-01, 4.9821957378204840E-01, 9.2652305802242962E-01, 9.2652305802242962E-01, 4.9821957378204840E-01, 1.3597036436097937E-01, 1.6248140264385626E-02, 5.9416527261081924E-04, 1.9028495068454171E-06}; - constexpr FLT c1[] = {7.9801239249145923E-06, 1.2318344820958854E-03, 2.1335987794357199E-02, 1.1394981969310448E-01, 2.3520579283187484E-01, 1.4166451219687695E-01, -1.4166451219687687E-01, -2.3520579283187476E-01, -1.1394981969310460E-01, -2.1335987794357230E-02, -1.2318344820958847E-03, -7.9801239249098540E-06}; - constexpr FLT c2[] = {1.4462226804444730E-05, 1.1205076408888257E-03, 1.1698445222077612E-02, 3.3958877046121660E-02, 1.3705098421608795E-02, -6.0497400607811481E-02, -6.0497400607811579E-02, 1.3705098421608806E-02, 3.3958877046121591E-02, 1.1698445222077622E-02, 1.1205076408888255E-03, 1.4462226804449267E-05}; - constexpr FLT c3[] = {1.4953735432776090E-05, 5.8049865432805142E-04, 3.2684769908807722E-03, 2.3619245295514353E-03, -1.0074268581043095E-02, -9.8551520939611746E-03, 9.8551520939615059E-03, 1.0074268581043251E-02, -2.3619245295513252E-03, -3.2684769908807648E-03, -5.8049865432805098E-04, -1.4953735432771914E-05}; - constexpr FLT c4[] = {9.7900673700200676E-06, 1.8351475200221906E-04, 3.8725987583789238E-04, -9.2229408802588448E-04, -1.5383560041742387E-03, 1.8800996948122926E-03, 1.8800996948123033E-03, -1.5383560041742409E-03, -9.2229408802591614E-04, 3.8725987583789064E-04, 1.8351475200221903E-04, 9.7900673700247601E-06}; - constexpr FLT c5[] = {4.2345162286123928E-06, 3.3664241555334181E-05, -3.0535096226552352E-05, -1.9795772057290591E-04, 1.7526295499606013E-04, 3.2830037656743561E-04, -3.2830037656734232E-04, -1.7526295499599014E-04, 1.9795772057292925E-04, 3.0535096226555273E-05, -3.3664241555334181E-05, -4.2345162286081255E-06}; - constexpr FLT c6[] = {1.2088615636792351E-06, 2.2204932634073669E-06, -1.5559909809157569E-05, 1.8771595438708362E-06, 4.7304527720902187E-05, -3.7055029721502823E-05, -3.7055029721506354E-05, 4.7304527720948991E-05, 1.8771595438366184E-06, -1.5559909809165219E-05, 2.2204932634074313E-06, 1.2088615636834544E-06}; - constexpr FLT c7[] = {2.1206307767331379E-07, -4.5869687934383747E-07, -1.3462277877507893E-06, 4.2970047520348418E-06, -1.1214870287581008E-06, -6.9831974682071699E-06, 6.9831974683366982E-06, 1.1214870288087690E-06, -4.2970047519748465E-06, 1.3462277877599186E-06, 4.5869687934394192E-07, -2.1206307766917122E-07}; - constexpr FLT c8[] = {1.5395324498807062E-08, -1.2022118042093087E-07, 1.5464523856613661E-07, 2.7605497716337475E-07, -8.4964626033234966E-07, 5.2067203458077506E-07, 5.2067203461734952E-07, -8.4964626032018743E-07, 2.7605497716040193E-07, 1.5464523856098652E-07, -1.2022118042095769E-07, 1.5395324502815322E-08}; - constexpr FLT c9[] = {-2.0816585198648028E-09, -6.8192670389370156E-09, 3.6338774649049193E-08, -4.9464520974759579E-08, -1.3242031035521981E-08, 1.0671664854533778E-07, -1.0671664854533778E-07, 1.3242031024450263E-08, 4.9464520977527511E-08, -3.6338774639015446E-08, 6.8192670391856967E-09, 2.0816585232951501E-09}; - constexpr FLT c10[] = {-6.3791929313390708E-10, 1.2240176132927394E-09, 5.3586930472778203E-10, -6.2807355748408205E-09, 1.0600657362033408E-08, -5.5585207892891946E-09, -5.5585208232281016E-09, 1.0600657414513137E-08, -6.2807355547288652E-09, 5.3586929184356377E-10, 1.2240176133909372E-09, -6.3791928984134277E-10}; + } else if constexpr (w==12) { + constexpr FLT c0[] = {1.9028495068410063E-06, 5.9416527261081902E-04, 1.6248140264385581E-02, 1.3597036436097903E-01, 4.9821957378204806E-01, 9.2652305802242907E-01, 9.2652305802242907E-01, 4.9821957378204812E-01, 1.3597036436097931E-01, 1.6248140264385608E-02, 5.9416527261081859E-04, 1.9028495068454166E-06}; + constexpr FLT c1[] = {7.9801239249145855E-06, 1.2318344820958845E-03, 2.1335987794357178E-02, 1.1394981969310444E-01, 2.3520579283187454E-01, 1.4166451219687673E-01, -1.4166451219687676E-01, -2.3520579283187454E-01, -1.1394981969310453E-01, -2.1335987794357209E-02, -1.2318344820958838E-03, -7.9801239249098489E-06}; + constexpr FLT c2[] = {1.4462226804444722E-05, 1.1205076408888253E-03, 1.1698445222077601E-02, 3.3958877046121619E-02, 1.3705098421608761E-02, -6.0497400607811530E-02, -6.0497400607811475E-02, 1.3705098421608780E-02, 3.3958877046121570E-02, 1.1698445222077622E-02, 1.1205076408888253E-03, 1.4462226804449270E-05}; + constexpr FLT c3[] = {1.4953735432776095E-05, 5.8049865432805109E-04, 3.2684769908807700E-03, 2.3619245295514145E-03, -1.0074268581043166E-02, -9.8551520939613255E-03, 9.8551520939613533E-03, 1.0074268581043166E-02, -2.3619245295513481E-03, -3.2684769908807652E-03, -5.8049865432805131E-04, -1.4953735432771914E-05}; + constexpr FLT c4[] = {9.7900673700200693E-06, 1.8351475200221882E-04, 3.8725987583789119E-04, -9.2229408802590768E-04, -1.5383560041742821E-03, 1.8800996948120981E-03, 1.8800996948121530E-03, -1.5383560041743090E-03, -9.2229408802591386E-04, 3.8725987583788620E-04, 1.8351475200221876E-04, 9.7900673700247567E-06}; + constexpr FLT c5[] = {4.2345162286123860E-06, 3.3664241555333734E-05, -3.0535096226560294E-05, -1.9795772057292226E-04, 1.7526295499594152E-04, 3.2830037656731109E-04, -3.2830037656744667E-04, -1.7526295499600649E-04, 1.9795772057293708E-04, 3.0535096226557868E-05, -3.3664241555334059E-05, -4.2345162286081238E-06}; + constexpr FLT c6[] = {1.2088615636792264E-06, 2.2204932634071259E-06, -1.5559909809165443E-05, 1.8771595438476344E-06, 4.7304527720868753E-05, -3.7055029721580005E-05, -3.7055029721527014E-05, 4.7304527720873781E-05, 1.8771595438202391E-06, -1.5559909809166056E-05, 2.2204932634071048E-06, 1.2088615636834573E-06}; + constexpr FLT c7[] = {2.1206307767331438E-07, -4.5869687934386224E-07, -1.3462277877514610E-06, 4.2970047520315825E-06, -1.1214870287791455E-06, -6.9831974682806686E-06, 6.9831974682536424E-06, 1.1214870287442797E-06, -4.2970047519973843E-06, 1.3462277877581536E-06, 4.5869687934387902E-07, -2.1206307766916990E-07}; + constexpr FLT c8[] = {1.5395324498812247E-08, -1.2022118042082968E-07, 1.5464523856734829E-07, 2.7605497718584214E-07, -8.4964626032687232E-07, 5.2067203458870276E-07, 5.2067203459885582E-07, -8.4964626035081063E-07, 2.7605497714687566E-07, 1.5464523855816190E-07, -1.2022118042113675E-07, 1.5395324502810954E-08}; + constexpr FLT c9[] = {-2.0816585198676747E-09, -6.8192670392830079E-09, 3.6338774645243295E-08, -4.9464520971991661E-08, -1.3242031032754055E-08, 1.0671664853980193E-07, -1.0671664859377656E-07, 1.3242030994003039E-08, 4.9464520976143551E-08, -3.6338774640745405E-08, 6.8192670390883880E-09, 2.0816585232941368E-09}; + constexpr FLT c10[] = {-6.3791929313743717E-10, 1.2240176130963314E-09, 5.3586929812854596E-10, -6.2807355943243368E-09, 1.0600657314581666E-08, -5.5585209501851270E-09, -5.5585209225311209E-09, 1.0600657309553665E-08, -6.2807355855253437E-09, 5.3586928838680225E-10, 1.2240176129588461E-09, -6.3791928984134226E-10}; for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); - } else if (w==13) { - constexpr FLT c0[] = {4.4408051211162946E-07, 1.8756193861873427E-04, 6.5146989208011716E-03, 6.8352802598867876E-02, 3.1564238810082484E-01, 7.5353649746793960E-01, 9.9999999999999956E-01, 7.5353649746793838E-01, 3.1564238810082484E-01, 6.8352802598867710E-02, 6.5146989208011707E-03, 1.8756193861873272E-04, 4.4408051211162761E-07}; - constexpr FLT c1[] = {1.9487148068106057E-06, 4.1285069961250701E-04, 9.2995630713278762E-03, 6.5021145064983563E-02, 1.8663042875530009E-01, 2.1451870821533808E-01, 1.8840858949353919E-32, -2.1451870821533794E-01, -1.8663042875529998E-01, -6.5021145064983438E-02, -9.2995630713278762E-03, -4.1285069961250425E-04, -1.9487148068106044E-06}; - constexpr FLT c2[] = {3.7267581324409626E-06, 4.0381251792508734E-04, 5.7019503038218408E-03, 2.4040868593456825E-02, 2.9406233528281710E-02, -2.4394921635639378E-02, -7.0323343245740924E-02, -2.4394921635639052E-02, 2.9406233528281724E-02, 2.4040868593456791E-02, 5.7019503038218382E-03, 4.0381251792508501E-04, 3.7267581324409626E-06}; - constexpr FLT c3[] = {4.1089519307370168E-06, 2.2941839162878727E-04, 1.8941440042457443E-03, 3.5673079836347822E-03, -3.6880489041048953E-03, -1.2074156718545214E-02, 7.1013810712957114E-17, 1.2074156718545436E-02, 3.6880489041048944E-03, -3.5673079836347674E-03, -1.8941440042457413E-03, -2.2941839162878624E-04, -4.1089519307370151E-06}; - constexpr FLT c4[] = {2.9080869014384424E-06, 8.2405696428180906E-05, 3.3386109283452779E-04, -1.7130036080580219E-04, -1.5108662980936900E-03, 7.8665018928679242E-05, 2.3686576883603073E-03, 7.8665018928764622E-05, -1.5108662980936485E-03, -1.7130036080580737E-04, 3.3386109283452861E-04, 8.2405696428180703E-05, 2.9080869014384429E-06}; - constexpr FLT c5[] = {1.3873038503072801E-06, 1.8694798962849948E-05, 1.4885937076477316E-05, -1.3109520271106624E-04, -4.6797213058790025E-05, 3.2555441892430825E-04, 6.5502537691746230E-17, -3.2555441892416048E-04, 4.6797213058875582E-05, 1.3109520271106819E-04, -1.4885937076477316E-05, -1.8694798962849962E-05, -1.3873038503072801E-06}; - constexpr FLT c6[] = {4.5216719173889445E-07, 2.3203195635245624E-06, -6.0547210914038460E-06, -1.2111482379340961E-05, 3.0238388566383385E-05, 1.0632529352081665E-05, -5.0954659549722746E-05, 1.0632529352250802E-05, 3.0238388566313227E-05, -1.2111482379347288E-05, -6.0547210914040671E-06, 2.3203195635247352E-06, 4.5216719173889350E-07}; - constexpr FLT c7[] = {9.7956192761412821E-08, 9.2080334896449358E-09, -1.2031586234326618E-06, 1.3860784486076025E-06, 2.8079238803293383E-06, -5.6034103145907796E-06, 1.6113788341939994E-17, 5.6034103146040687E-06, -2.8079238803054550E-06, -1.3860784485997179E-06, 1.2031586234342167E-06, -9.2080334898128650E-09, -9.7956192761411458E-08}; - constexpr FLT c8[] = {1.2350515865275843E-08, -4.7668301905167552E-08, -3.2637845350597966E-08, 3.2101904613347501E-07, -3.3650826994957826E-07, -3.1117289066304045E-07, 7.8771611535813792E-07, -3.1117289069990237E-07, -3.3650826984246136E-07, 3.2101904612282309E-07, -3.2637845349600439E-08, -4.7668301904853071E-08, 1.2350515865276535E-08}; - constexpr FLT c9[] = {2.7912946705592266E-10, -6.8584366111657433E-09, 1.5876438439662156E-08, 2.2894800381734934E-09, -5.4355139631893104E-08, 6.9215572156100812E-08, 1.6320619156148685E-17, -6.9215572241906639E-08, 5.4355139637428967E-08, -2.2894800215659153E-09, -1.5876438439575659E-08, 6.8584366109657170E-09, -2.7912946705524691E-10}; - constexpr FLT c10[] = {-1.9473100882503891E-10, -6.0076128424585684E-11, 1.8131864354130518E-09, -3.9994904462490394E-09, 2.0334605597831887E-09, 5.0274131974512103E-09, -9.3367591026663196E-09, 5.0274136044049357E-09, 2.0334605333861501E-09, -3.9994904745315308E-09, 1.8131864358844393E-09, -6.0076128154532669E-11, -1.9473100882561411E-10}; - constexpr FLT c11[] = {-2.9813639427701670E-11, 8.8416967305832406E-11, -6.1944900155883343E-11, -2.3424446318938161E-10, 6.6123632509207570E-10, -6.5395825305270265E-10, -7.6394712006965382E-17, 6.5395802534269801E-10, -6.6123633886256970E-10, 2.3424448263843040E-10, 6.1944899055662456E-11, -8.8416967554269098E-11, 2.9813639428048382E-11}; + } else if constexpr (w==13) { + constexpr FLT c0[] = {4.4408051211162845E-07, 1.8756193861873413E-04, 6.5146989208011647E-03, 6.8352802598867821E-02, 3.1564238810082473E-01, 7.5353649746793916E-01, 9.9999999999999911E-01, 7.5353649746793805E-01, 3.1564238810082468E-01, 6.8352802598867654E-02, 6.5146989208011647E-03, 1.8756193861873245E-04, 4.4408051211162560E-07}; + constexpr FLT c1[] = {1.9487148068106031E-06, 4.1285069961250658E-04, 9.2995630713278675E-03, 6.5021145064983493E-02, 1.8663042875529987E-01, 2.1451870821533786E-01, -8.4851685343650422E-17, -2.1451870821533786E-01, -1.8663042875529984E-01, -6.5021145064983382E-02, -9.2995630713278675E-03, -4.1285069961250398E-04, -1.9487148068106023E-06}; + constexpr FLT c2[] = {3.7267581324409613E-06, 4.0381251792508718E-04, 5.7019503038218417E-03, 2.4040868593456794E-02, 2.9406233528281669E-02, -2.4394921635639406E-02, -7.0323343245741035E-02, -2.4394921635639197E-02, 2.9406233528281669E-02, 2.4040868593456784E-02, 5.7019503038218391E-03, 4.0381251792508517E-04, 3.7267581324409634E-06}; + constexpr FLT c3[] = {4.1089519307370176E-06, 2.2941839162878732E-04, 1.8941440042457441E-03, 3.5673079836347735E-03, -3.6880489041049131E-03, -1.2074156718545290E-02, 2.5879613397797189E-18, 1.2074156718545309E-02, 3.6880489041048650E-03, -3.5673079836347895E-03, -1.8941440042457428E-03, -2.2941839162878632E-04, -4.1089519307370168E-06}; + constexpr FLT c4[] = {2.9080869014384416E-06, 8.2405696428180852E-05, 3.3386109283452606E-04, -1.7130036080581002E-04, -1.5108662980937166E-03, 7.8665018928577543E-05, 2.3686576883603064E-03, 7.8665018928625479E-05, -1.5108662980936988E-03, -1.7130036080582076E-04, 3.3386109283452714E-04, 8.2405696428180622E-05, 2.9080869014384416E-06}; + constexpr FLT c5[] = {1.3873038503072792E-06, 1.8694798962849680E-05, 1.4885937076473728E-05, -1.3109520271108061E-04, -4.6797213058848897E-05, 3.2555441892417972E-04, -4.6307891541053687E-17, -3.2555441892426857E-04, 4.6797213058873732E-05, 1.3109520271107364E-04, -1.4885937076475722E-05, -1.8694798962849945E-05, -1.3873038503072794E-06}; + constexpr FLT c6[] = {4.5216719173889313E-07, 2.3203195635244226E-06, -6.0547210914045186E-06, -1.2111482379357380E-05, 3.0238388566365154E-05, 1.0632529352020189E-05, -5.0954659549824471E-05, 1.0632529352183524E-05, 3.0238388566276076E-05, -1.2111482379361646E-05, -6.0547210914053648E-06, 2.3203195635247182E-06, 4.5216719173889350E-07}; + constexpr FLT c7[] = {9.7956192761412040E-08, 9.2080334896145121E-09, -1.2031586234320926E-06, 1.3860784486078430E-06, 2.8079238803383842E-06, -5.6034103145900825E-06, -1.8688507553468845E-18, 5.6034103144765343E-06, -2.8079238803764495E-06, -1.3860784486165916E-06, 1.2031586234329267E-06, -9.2080334898328828E-09, -9.7956192761411590E-08}; + constexpr FLT c8[] = {1.2350515865276535E-08, -4.7668301905096242E-08, -3.2637845350574011E-08, 3.2101904614218007E-07, -3.3650826992335862E-07, -3.1117289070051007E-07, 7.8771611533075144E-07, -3.1117289086091704E-07, -3.3650826988353573E-07, 3.2101904611210855E-07, -3.2637845351216048E-08, -4.7668301904904800E-08, 1.2350515865275286E-08}; + constexpr FLT c9[] = {2.7912946705524691E-10, -6.8584366112738666E-09, 1.5876438438624186E-08, 2.2894800340215993E-09, -5.4355139634661042E-08, 6.9215572145029099E-08, -1.8889191980602000E-17, -6.9215572300033164E-08, 5.4355139615285534E-08, -2.2894800243338452E-09, -1.5876438440267642E-08, 6.8584366109440936E-09, -2.7912946705533138E-10}; + constexpr FLT c10[] = {-1.9473100882674590E-10, -6.0076128557162250E-11, 1.8131864338418081E-09, -3.9994904625900363E-09, 2.0334605572691713E-09, 5.0274131302017483E-09, -9.3367592069973016E-09, 5.0274134843614371E-09, 2.0334604856201788E-09, -3.9994904962147793E-09, 1.8131864331347476E-09, -6.0076128257643601E-11, -1.9473100882626639E-10}; + constexpr FLT c11[] = {-2.9813639428048576E-11, 8.8416967288090687E-11, -6.1944898026433027E-11, -2.3424445580726660E-10, 6.6123636775224220E-10, -6.5395825191702606E-10, 1.1351922232346565E-17, 6.5395800007317743E-10, -6.6123637406963074E-10, 2.3424447837951271E-10, 6.1944899055672989E-11, -8.8416967523218366E-11, 2.9813639427979028E-11}; for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); - } else if (w==14) { - constexpr FLT c0[] = {1.0213002307223062E-07, 5.7528591418445639E-05, 2.5031206020280088E-03, 3.2405046511689233E-02, 1.8485678142025513E-01, 5.5177865704975304E-01, 9.3670793123951734E-01, 9.3670793123951712E-01, 5.5177865704975315E-01, 1.8485678142025547E-01, 3.2405046511689239E-02, 2.5031206020280179E-03, 5.7528591418445801E-05, 1.0213002307242253E-07}; - constexpr FLT c1[] = {4.6718564624239767E-07, 1.3360375098030156E-04, 3.8410346178215306E-03, 3.4207779106833425E-02, 1.2923501383683489E-01, 2.2132894130184291E-01, 1.2264779624530273E-01, -1.2264779624530257E-01, -2.2132894130184308E-01, -1.2923501383683503E-01, -3.4207779106833425E-02, -3.8410346178215393E-03, -1.3360375098030178E-04, -4.6718564624220264E-07}; - constexpr FLT c2[] = {9.3810713124204527E-07, 1.3926941499858519E-04, 2.5833386162539013E-03, 1.4797516242328850E-02, 3.0361769467151970E-02, 5.7261067343619262E-03, -5.3608938764866873E-02, -5.3608938764866894E-02, 5.7261067343618603E-03, 3.0361769467151870E-02, 1.4797516242328836E-02, 2.5833386162539061E-03, 1.3926941499858543E-04, 9.3810713124224814E-07}; - constexpr FLT c3[] = {1.0954436997682021E-06, 8.5568590196649221E-05, 9.7778250562911601E-04, 3.0692948752812804E-03, 6.0463237460738756E-04, -8.9532302111318181E-03, -7.4040784665309846E-03, 7.4040784665312838E-03, 8.9532302111319968E-03, -6.0463237460737487E-04, -3.0692948752812708E-03, -9.7778250562911818E-04, -8.5568590196649329E-05, -1.0954436997680333E-06}; - constexpr FLT c4[] = {8.3014334976692641E-07, 3.4045323043173900E-05, 2.1660980714121239E-04, 1.7421792587401689E-04, -9.2118064021561887E-04, -9.7597008655075522E-04, 1.4714477548413631E-03, 1.4714477548414121E-03, -9.7597008655073809E-04, -9.2118064021559762E-04, 1.7421792587402266E-04, 2.1660980714121363E-04, 3.4045323043173968E-05, 8.3014334976713224E-07}; - constexpr FLT c5[] = {4.3045614796951587E-07, 8.9716871724550274E-06, 2.3377513570381849E-05, -5.5213296993546423E-05, -1.2391624765752083E-04, 1.5869855385555775E-04, 2.1530382494154427E-04, -2.1530382494144317E-04, -1.5869855385557331E-04, 1.2391624765755973E-04, 5.5213296993542533E-05, -2.3377513570381968E-05, -8.9716871724550325E-06, -4.3045614796933747E-07}; - constexpr FLT c6[] = {1.5611302559652642E-07, 1.4859455506706785E-06, -8.5826557923722616E-07, -1.1616353402592630E-05, 8.0333594878995593E-06, 2.8616079443375728E-05, -2.5816776957707699E-05, -2.5816776957707652E-05, 2.8616079443268301E-05, 8.0333594878977314E-06, -1.1616353402591744E-05, -8.5826557923811989E-07, 1.4859455506706314E-06, 1.5611302559670737E-07}; - constexpr FLT c7[] = {3.9336515129721532E-08, 1.1257285216182540E-07, -6.2406181937560562E-07, -2.6873173855233150E-07, 2.8292088258393860E-06, -1.4598715516905790E-06, -4.0212462690723253E-06, 4.0212462691823422E-06, 1.4598715517761175E-06, -2.8292088259133913E-06, 2.6873173855647969E-07, 6.2406181937648769E-07, -1.1257285216174059E-07, -3.9336515129545720E-08}; - constexpr FLT c8[] = {6.5041263396088790E-09, -9.9149367808853263E-09, -6.6845758889620994E-08, 1.6286641992901855E-07, 5.8507874943424797E-08, -4.7688540978638226E-07, 3.2559878511421460E-07, 3.2559878519979701E-07, -4.7688540972525423E-07, 5.8507875026096430E-08, 1.6286641993325022E-07, -6.6845758889870313E-08, -9.9149367809131923E-09, 6.5041263397795280E-09}; - constexpr FLT c9[] = {5.5138523621090170E-10, -3.4792607432658830E-09, 2.1621109687111844E-09, 1.6802313210571416E-08, -3.4440501484206901E-08, 3.6408051867813727E-09, 5.4274262350067578E-08, -5.4274262322388281E-08, -3.6408052006210212E-09, 3.4440501481438969E-08, -1.6802313213339344E-08, -2.1621109679759532E-09, 3.4792607432902108E-09, -5.5138523606396516E-10}; - constexpr FLT c10[] = {-2.3785683828448576E-11, -2.9453404124114860E-10, 1.0997757897423152E-09, -8.6020468987368310E-10, -2.2974592934948612E-09, 5.5064437603692059E-09, -3.1470905819229834E-09, -3.1470905272434506E-09, 5.5064436867561607E-09, -2.2974592840673907E-09, -8.6020468484567061E-10, 1.0997757884067548E-09, -2.9453404129270796E-10, -2.3785683688822786E-11}; - constexpr FLT c11[] = {-1.2240623323339709E-11, 1.4269095096874458E-11, 6.3689195980296716E-11, -2.3523039255622989E-10, 2.6546832331592691E-10, 9.4137182189250380E-11, -5.6473803777133577E-10, 5.6473799518218520E-10, -9.4137157913436917E-11, -2.6546835890448598E-10, 2.3523039312408576E-10, -6.3689194329967738E-11, -1.4269094997055950E-11, 1.2240623457297303E-11}; - constexpr FLT c12[] = {-1.4791529085565623E-12, 4.8147158180813514E-12, -7.1247159181258048E-12, -3.7363568005007135E-12, 3.0923958877552072E-11, -4.7998366007614543E-11, 2.4268802632733111E-11, 2.4268880217882715E-11, -4.7998325173324774E-11, 3.0923998690985708E-11, -3.7363589698227313E-12, -7.1247171622956968E-12, 4.8147157313484649E-12, -1.4791527915262285E-12}; + } else if constexpr (w==14) { + constexpr FLT c0[] = {1.0213002307223099E-07, 5.7528591418445659E-05, 2.5031206020280070E-03, 3.2405046511689226E-02, 1.8485678142025505E-01, 5.5177865704975260E-01, 9.3670793123951679E-01, 9.3670793123951657E-01, 5.5177865704975282E-01, 1.8485678142025541E-01, 3.2405046511689233E-02, 2.5031206020280157E-03, 5.7528591418445747E-05, 1.0213002307242220E-07}; + constexpr FLT c1[] = {4.6718564624239740E-07, 1.3360375098030145E-04, 3.8410346178215263E-03, 3.4207779106833411E-02, 1.2923501383683475E-01, 2.2132894130184272E-01, 1.2264779624530245E-01, -1.2264779624530252E-01, -2.2132894130184286E-01, -1.2923501383683494E-01, -3.4207779106833397E-02, -3.8410346178215349E-03, -1.3360375098030170E-04, -4.6718564624220221E-07}; + constexpr FLT c2[] = {9.3810713124204495E-07, 1.3926941499858519E-04, 2.5833386162538992E-03, 1.4797516242328836E-02, 3.0361769467151925E-02, 5.7261067343618525E-03, -5.3608938764866769E-02, -5.3608938764866901E-02, 5.7261067343618776E-03, 3.0361769467151856E-02, 1.4797516242328839E-02, 2.5833386162539074E-03, 1.3926941499858541E-04, 9.3810713124224803E-07}; + constexpr FLT c3[] = {1.0954436997682027E-06, 8.5568590196649221E-05, 9.7778250562911557E-04, 3.0692948752812743E-03, 6.0463237460739363E-04, -8.9532302111319344E-03, -7.4040784665310487E-03, 7.4040784665310947E-03, 8.9532302111319205E-03, -6.0463237460742344E-04, -3.0692948752812769E-03, -9.7778250562911883E-04, -8.5568590196649329E-05, -1.0954436997680335E-06}; + constexpr FLT c4[] = {8.3014334976692652E-07, 3.4045323043173907E-05, 2.1660980714121198E-04, 1.7421792587401570E-04, -9.2118064021565335E-04, -9.7597008655077061E-04, 1.4714477548412753E-03, 1.4714477548412239E-03, -9.7597008655075674E-04, -9.2118064021558829E-04, 1.7421792587401792E-04, 2.1660980714121350E-04, 3.4045323043173961E-05, 8.3014334976713193E-07}; + constexpr FLT c5[] = {4.3045614796951603E-07, 8.9716871724549834E-06, 2.3377513570379932E-05, -5.5213296993551241E-05, -1.2391624765756808E-04, 1.5869855385552151E-04, 2.1530382494143235E-04, -2.1530382494149448E-04, -1.5869855385556634E-04, 1.2391624765755097E-04, 5.5213296993546240E-05, -2.3377513570381768E-05, -8.9716871724550257E-06, -4.3045614796933741E-07}; + constexpr FLT c6[] = {1.5611302559652602E-07, 1.4859455506706615E-06, -8.5826557923829850E-07, -1.1616353402592116E-05, 8.0333594878813938E-06, 2.8616079443283100E-05, -2.5816776957767652E-05, -2.5816776957766372E-05, 2.8616079443288531E-05, 8.0333594878547106E-06, -1.1616353402592575E-05, -8.5826557923815948E-07, 1.4859455506706291E-06, 1.5611302559670753E-07}; + constexpr FLT c7[] = {3.9336515129721677E-08, 1.1257285216180971E-07, -6.2406181937583209E-07, -2.6873173854847655E-07, 2.8292088258418352E-06, -1.4598715517537821E-06, -4.0212462691065446E-06, 4.0212462690602856E-06, 1.4598715517232874E-06, -2.8292088259432636E-06, 2.6873173854950209E-07, 6.2406181937597503E-07, -1.1257285216174927E-07, -3.9336515129545587E-08}; + constexpr FLT c8[] = {6.5041263396091917E-09, -9.9149367808629461E-09, -6.6845758889170809E-08, 1.6286641993610469E-07, 5.8507874918728810E-08, -4.7688540980072603E-07, 3.2559878510565369E-07, 3.2559878507638357E-07, -4.7688540979996857E-07, 5.8507875007784286E-08, 1.6286641992908438E-07, -6.6845758890639352E-08, -9.9149367809316500E-09, 6.5041263397792741E-09}; + constexpr FLT c9[] = {5.5138523621066947E-10, -3.4792607432875080E-09, 2.1621109678029579E-09, 1.6802313211955381E-08, -3.4440501485590873E-08, 3.6408052172286002E-09, 5.4274262273949518E-08, -5.4274262358371372E-08, -3.6408052172286002E-09, 3.4440501466215365E-08, -1.6802313212993354E-08, -2.1621109678462066E-09, 3.4792607432848052E-09, -5.5138523606411313E-10}; + constexpr FLT c10[] = {-2.3785683828684552E-11, -2.9453404126079047E-10, 1.0997757891923789E-09, -8.6020468735967815E-10, -2.2974593186348686E-09, 5.5064436653871647E-09, -3.1470907051089416E-09, -3.1470906717984420E-09, 5.5064436660942271E-09, -2.2974593374898691E-09, -8.6020469175917524E-10, 1.0997757873068796E-09, -2.9453404132707798E-10, -2.3785683688860141E-11}; + constexpr FLT c11[] = {-1.2240623323305377E-11, 1.4269095094657965E-11, 6.3689196104514816E-11, -2.3523038822634284E-10, 2.6546833799145628E-10, 9.4137178072301786E-11, -5.6473803862315189E-10, 5.6473798283135854E-10, -9.4137201354365380E-11, -2.6546836658830022E-10, 2.3523038971696211E-10, -6.3689194667129632E-11, -1.4269094983749057E-11, 1.2240623457522869E-11}; + constexpr FLT c12[] = {-1.4791529083698136E-12, 4.8147158489874929E-12, -7.1247151365331094E-12, -3.7363553968273554E-12, 3.0923952752421019E-11, -4.7998353757330678E-11, 2.4268818966431809E-11, 2.4268806716180134E-11, -4.7998370091044259E-11, 3.0923989503259558E-11, -3.7363621599957160E-12, -7.1247176727239074E-12, 4.8147157293535152E-12, -1.4791527916039239E-12}; for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); - } else if (w==15) { - constexpr FLT c0[] = {2.3183302143948793E-08, 1.7202745817468655E-05, 9.2668857465754784E-04, 1.4607490553401936E-02, 1.0130044556641116E-01, 3.7041488405244677E-01, 7.8279781886019206E-01, 1.0000000000000018E+00, 7.8279781886019228E-01, 3.7041488405244727E-01, 1.0130044556641139E-01, 1.4607490553401959E-02, 9.2668857465754882E-04, 1.7202745817468652E-05, 2.3183302143948763E-08}; - constexpr FLT c1[] = {1.1019919454791572E-07, 4.1938159428224126E-05, 1.5154850601194973E-03, 1.6839357628952684E-02, 8.0835952724673255E-02, 1.8739074372244105E-01, 1.9255567517255739E-01, -9.4204294746769593E-32, -1.9255567517255723E-01, -1.8739074372244108E-01, -8.0835952724673352E-02, -1.6839357628952709E-02, -1.5154850601194973E-03, -4.1938159428224126E-05, -1.1019919454791572E-07}; - constexpr FLT c2[] = {2.3137327105312791E-07, 4.6266060425611204E-05, 1.1028009511991974E-03, 8.2352859806754802E-03, 2.4233386066663413E-02, 2.2182889945939449E-02, -2.5327411650384993E-02, -6.0946897479642256E-02, -2.5327411650385129E-02, 2.2182889945939359E-02, 2.4233386066663424E-02, 8.2352859806754854E-03, 1.1028009511991970E-03, 4.6266060425611204E-05, 2.3137327105312783E-07}; - constexpr FLT c3[] = {2.8457821671573274E-07, 3.0427184404092299E-05, 4.6337319534911844E-04, 2.1072304367244932E-03, 2.4342755210407531E-03, -4.2814200474568563E-03, -9.6703299158782657E-03, 1.8176153030403361E-16, 9.6703299158783507E-03, 4.2814200474569379E-03, -2.4342755210407076E-03, -2.1072304367244859E-03, -4.6337319534911817E-04, -3.0427184404092296E-05, -2.8457821671573279E-07}; - constexpr FLT c4[] = {2.2919642176438702E-07, 1.3183839322480003E-05, 1.2030953406839325E-04, 2.4905754342428421E-04, -3.4193403196993951E-04, -1.1551611179404738E-03, 2.1954335627567210E-04, 1.7895433812201793E-03, 2.1954335627571010E-04, -1.1551611179404326E-03, -3.4193403196995387E-04, 2.4905754342428610E-04, 1.2030953406839360E-04, 1.3183839322480008E-05, 2.2919642176438720E-07}; - constexpr FLT c5[] = {1.2779800356186583E-07, 3.8997040140349313E-06, 1.8264189394307498E-05, -8.3632912035128204E-06, -1.0687544349164653E-04, 2.2123224044726536E-06, 2.3404180714514772E-04, 6.5064979845545577E-17, -2.3404180714503106E-04, -2.2123224042782134E-06, 1.0687544349166598E-04, 8.3632912035006689E-06, -1.8264189394307559E-05, -3.8997040140349338E-06, -1.2779800356186589E-07}; - constexpr FLT c6[] = {5.0693377499403691E-08, 7.7594237801400426E-07, 9.4933483676717755E-07, -6.6987818302423087E-06, -4.5889941143373546E-06, 2.2647907184667538E-05, 3.7412856035449417E-06, -3.3754692339426772E-05, 3.7412856034892404E-06, 2.2647907184654951E-05, -4.5889941143014083E-06, -6.6987818302351157E-06, 9.4933483676684456E-07, 7.7594237801399991E-07, 5.0693377499403691E-08}; - constexpr FLT c7[] = {1.4373673262756881E-08, 9.2554419735729795E-08, -2.0417866965615742E-07, -6.8820764686271727E-07, 1.4165168644096691E-06, 1.2531774951198972E-06, -3.6383191328570317E-06, 5.9333697238861927E-17, 3.6383191329076855E-06, -1.2531774952992520E-06, -1.4165168643945163E-06, 6.8820764685908223E-07, 2.0417866965620961E-07, -9.2554419735731158E-08, -1.4373673262756913E-08}; - constexpr FLT c8[] = {2.8405432421064598E-09, 2.6648052024128211E-09, -4.5328290134778586E-08, 3.2089634828694367E-08, 1.7241593348808383E-07, -2.5816631656161770E-07, -1.3664009513726493E-07, 4.6017883216168089E-07, -1.3664009510064915E-07, -2.5816631656773852E-07, 1.7241593343152281E-07, 3.2089634835965337E-08, -4.5328290134523662E-08, 2.6648052024185691E-09, 2.8405432421065198E-09}; - constexpr FLT c9[] = {3.5447644664522991E-10, -1.1390658479562114E-09, -2.4324028601311552E-09, 1.2152005527725076E-08, -7.1102518341828894E-09, -2.5878341862165437E-08, 4.0855407178225425E-08, -6.7229636689436406E-18, -4.0855407139474409E-08, 2.5878341989490202E-08, 7.1102518840056246E-09, -1.2152005535163887E-08, 2.4324028601311552E-09, 1.1390658479600971E-09, -3.5447644664517713E-10}; - constexpr FLT c10[] = {1.6106092880607926E-11, -1.9612809866225313E-10, 3.3667881388500915E-10, 5.4740705815843633E-10, -2.3219918220819429E-09, 1.8783264389538617E-09, 2.1531915835821252E-09, -4.8374637778167195E-09, 2.1531915732119103E-09, 1.8783264455530896E-09, -2.3219918255386980E-09, 5.4740706350069505E-10, 3.3667881394392907E-10, -1.9612809866164026E-10, 1.6106092880601619E-11}; - constexpr FLT c11[] = {-2.9809392328002639E-12, -8.3268200084267327E-12, 5.7687950483526562E-11, -9.1929198156856840E-11, -3.9289938224686938E-11, 3.0713724621937891E-10, -3.5332675603861928E-10, -4.7176615708722248E-17, 3.5332675632254561E-10, -3.0713734445835836E-10, 3.9289964949381516E-11, 9.1929194004414145E-11, -5.7687950660981567E-11, 8.3268199995541140E-12, 2.9809392327699276E-12}; - constexpr FLT c12[] = {-6.7275763613050405E-13, 1.4037883809519618E-12, 1.0122748224833392E-12, -1.0507010409950668E-11, 1.9186635811522471E-11, -7.9758147674463026E-12, -2.2999207389706864E-11, 4.0853090072343795E-11, -2.2999199222849929E-11, -7.9758923525966314E-12, 1.9186574560087790E-11, -1.0507007219772089E-11, 1.0122747905815843E-12, 1.4037883779612130E-12, -6.7275763610714771E-13}; + } else if constexpr (w==15) { + constexpr FLT c0[] = {2.3183302143948842E-08, 1.7202745817468652E-05, 9.2668857465754795E-04, 1.4607490553401931E-02, 1.0130044556641114E-01, 3.7041488405244660E-01, 7.8279781886019173E-01, 1.0000000000000009E+00, 7.8279781886019184E-01, 3.7041488405244710E-01, 1.0130044556641132E-01, 1.4607490553401952E-02, 9.2668857465754849E-04, 1.7202745817468621E-05, 2.3183302143948631E-08}; + constexpr FLT c1[] = {1.1019919454791570E-07, 4.1938159428224099E-05, 1.5154850601194960E-03, 1.6839357628952667E-02, 8.0835952724673157E-02, 1.8739074372244086E-01, 1.9255567517255706E-01, -1.2727752801547544E-16, -1.9255567517255720E-01, -1.8739074372244102E-01, -8.0835952724673296E-02, -1.6839357628952695E-02, -1.5154850601194960E-03, -4.1938159428224085E-05, -1.1019919454791562E-07}; + constexpr FLT c2[] = {2.3137327105312783E-07, 4.6266060425611198E-05, 1.1028009511991968E-03, 8.2352859806754733E-03, 2.4233386066663389E-02, 2.2182889945939421E-02, -2.5327411650385150E-02, -6.0946897479642458E-02, -2.5327411650385059E-02, 2.2182889945939335E-02, 2.4233386066663410E-02, 8.2352859806754854E-03, 1.1028009511991972E-03, 4.6266060425611211E-05, 2.3137327105312794E-07}; + constexpr FLT c3[] = {2.8457821671573279E-07, 3.0427184404092306E-05, 4.6337319534911855E-04, 2.1072304367244915E-03, 2.4342755210407414E-03, -4.2814200474568876E-03, -9.6703299158782969E-03, 3.9043785854177514E-17, 9.6703299158782969E-03, 4.2814200474568772E-03, -2.4342755210407176E-03, -2.1072304367244893E-03, -4.6337319534911855E-04, -3.0427184404092299E-05, -2.8457821671573274E-07}; + constexpr FLT c4[] = {2.2919642176438710E-07, 1.3183839322479992E-05, 1.2030953406839326E-04, 2.4905754342428356E-04, -3.4193403196994986E-04, -1.1551611179404832E-03, 2.1954335627552253E-04, 1.7895433812200067E-03, 2.1954335627561873E-04, -1.1551611179404482E-03, -3.4193403196996477E-04, 2.4905754342428507E-04, 1.2030953406839333E-04, 1.3183839322479986E-05, 2.2919642176438702E-07}; + constexpr FLT c5[] = {1.2779800356186581E-07, 3.8997040140349194E-06, 1.8264189394307068E-05, -8.3632912035152412E-06, -1.0687544349168481E-04, 2.2123224043759331E-06, 2.3404180714502455E-04, -8.9283573250376576E-17, -2.3404180714513341E-04, -2.2123224043065501E-06, 1.0687544349165996E-04, 8.3632912035017057E-06, -1.8264189394307319E-05, -3.8997040140349262E-06, -1.2779800356186586E-07}; + constexpr FLT c6[] = {5.0693377499403665E-08, 7.7594237801399526E-07, 9.4933483676662116E-07, -6.6987818302435208E-06, -4.5889941143450991E-06, 2.2647907184643285E-05, 3.7412856034685821E-06, -3.3754692339572014E-05, 3.7412856034970381E-06, 2.2647907184613792E-05, -4.5889941143207909E-06, -6.6987818302368428E-06, 9.4933483676682646E-07, 7.7594237801399970E-07, 5.0693377499403770E-08}; + constexpr FLT c7[] = {1.4373673262756888E-08, 9.2554419735730244E-08, -2.0417866965606358E-07, -6.8820764686333942E-07, 1.4165168644143546E-06, 1.2531774951245720E-06, -3.6383191328704321E-06, -2.0897699832877811E-17, 3.6383191329551320E-06, -1.2531774953500551E-06, -1.4165168644039183E-06, 6.8820764685736064E-07, 2.0417866965590807E-07, -9.2554419735735446E-08, -1.4373673262756828E-08}; + constexpr FLT c8[] = {2.8405432421064975E-09, 2.6648052024114868E-09, -4.5328290134560965E-08, 3.2089634829231624E-08, 1.7241593347641002E-07, -2.5816631652190980E-07, -1.3664009514358830E-07, 4.6017883222989653E-07, -1.3664009512350994E-07, -2.5816631659332659E-07, 1.7241593342393075E-07, 3.2089634833589152E-08, -4.5328290134813698E-08, 2.6648052024069750E-09, 2.8405432421063999E-09}; + constexpr FLT c9[] = {3.5447644664513494E-10, -1.1390658479693891E-09, -2.4324028602392776E-09, 1.2152005526514109E-08, -7.1102518507904692E-09, -2.5878341876005089E-08, 4.0855407136706484E-08, -5.9210554646947549E-17, -4.0855407178225432E-08, 2.5878341947971261E-08, 7.1102518757018364E-09, -1.2152005536547854E-08, 2.4324028600230332E-09, 1.1390658479612798E-09, -3.5447644664519832E-10}; + constexpr FLT c10[] = {1.6106092880537325E-11, -1.9612809867698403E-10, 3.3667881349219601E-10, 5.4740705595869105E-10, -2.3219918283669447E-09, 1.8783264163278577E-09, 2.1531914277141704E-09, -4.8374639764226820E-09, 2.1531914717091667E-09, 1.8783263625911151E-09, -2.3219918478504417E-09, 5.4740705910119197E-10, 3.3667881361003942E-10, -1.9612809867146024E-10, 1.6106092880601503E-11}; + constexpr FLT c11[] = {-2.9809392327917833E-12, -8.3268200023262531E-12, 5.7687950510146423E-11, -9.1929196737224722E-11, -3.9289932155730767E-11, 3.0713727283761530E-10, -3.5332673786726851E-10, -2.2557302724000076E-17, 3.5332680970097469E-10, -3.0713736397839930E-10, 3.9289960867914983E-11, 9.1929193649511812E-11, -5.7687950740837273E-11, 8.3268200012169312E-12, 2.9809392328567880E-12}; + constexpr FLT c12[] = {-6.7275763607221558E-13, 1.4037883829462834E-12, 1.0122749261648481E-12, -1.0507010154740660E-11, 1.9186628665524953E-11, -7.9757821000062307E-12, -2.2999174722289432E-11, 4.0853175824354315E-11, -2.2999195139407817E-11, -7.9759168531799436E-12, 1.9186570476657097E-11, -1.0507008495839712E-11, 1.0122746789244087E-12, 1.4037883764654859E-12, -6.7275763614208155E-13}; for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); - } else if (w==16) { - constexpr FLT c0[] = {5.2012152104084075E-09, 5.0291159580938685E-06, 3.3201112337137914E-04, 6.3015433246683345E-03, 5.2427915343763419E-02, 2.3104762006593382E-01, 5.9521037322997228E-01, 9.4441119081353919E-01, 9.4441119081353897E-01, 5.9521037322997228E-01, 2.3104762006593382E-01, 5.2427915343763426E-02, 6.3015433246683362E-03, 3.3201112337137925E-04, 5.0291159580938685E-06, 5.2012152104083968E-09}; - constexpr FLT c1[] = {2.5620581163903698E-08, 1.2815874111792785E-05, 5.7471335914300648E-04, 7.8386860177525539E-03, 4.6638901641906975E-02, 1.3897554029141568E-01, 2.0773808644544139E-01, 1.0813440420918323E-01, -1.0813440420918335E-01, -2.0773808644544151E-01, -1.3897554029141571E-01, -4.6638901641906962E-02, -7.8386860177525539E-03, -5.7471335914300648E-04, -1.2815874111792780E-05, -2.5620581163903678E-08}; - constexpr FLT c2[] = {5.6049296769722407E-08, 1.4879146623074265E-05, 4.4787865139353408E-04, 4.2383440773521713E-03, 1.6624620601556200E-02, 2.6395394769117682E-02, 3.6740117889108559E-04, -4.8088574473126838E-02, -4.8088574473126817E-02, 3.6740117889110039E-04, 2.6395394769117647E-02, 1.6624620601556183E-02, 4.2383440773521705E-03, 4.4787865139353381E-04, 1.4879146623074262E-05, 5.6049296769722367E-08}; - constexpr FLT c3[] = {7.2283166867263369E-08, 1.0391634193778174E-05, 2.0529674430143886E-04, 1.2618687081127949E-03, 2.6256301814801142E-03, -5.5040645592548403E-04, -7.8709464111364428E-03, -5.7657980103485666E-03, 5.7657980103488684E-03, 7.8709464111365764E-03, 5.5040645592556046E-04, -2.6256301814800891E-03, -1.2618687081127923E-03, -2.0529674430143870E-04, -1.0391634193778174E-05, -7.2283166867263382E-08}; - constexpr FLT c4[] = {6.1501023800531295E-08, 4.8443034242391149E-06, 6.0167136036954489E-05, 2.0573318254801955E-04, 1.2811955521425743E-05, -8.3782209201439741E-04, -6.2669687707126603E-04, 1.1809008871739588E-03, 1.1809008871740102E-03, -6.2669687707129801E-04, -8.3782209201439957E-04, 1.2811955521424802E-05, 2.0573318254801969E-04, 6.0167136036954442E-05, 4.8443034242391132E-06, 6.1501023800531308E-08}; - constexpr FLT c5[] = {3.6571939291734573E-08, 1.5742222553115388E-06, 1.1217451065775747E-05, 1.0668471374318139E-05, -6.0694020243058218E-05, -7.4268888177597524E-05, 1.3567546096387106E-04, 1.4875477215044619E-04, -1.4875477215041898E-04, -1.3567546096383994E-04, 7.4268888177628640E-05, 6.0694020243062108E-05, -1.0668471374318139E-05, -1.1217451065775808E-05, -1.5742222553115373E-06, -3.6571939291734560E-08}; - constexpr FLT c6[] = {1.5672684443241293E-08, 3.5812571134853537E-07, 1.1292168823203332E-06, -2.5215449854185100E-06, -7.6275609266365118E-06, 9.3973092319789718E-06, 1.7891569285072030E-05, -1.8642776809419116E-05, -1.8642776809435267E-05, 1.7891569285119396E-05, 9.3973092319861496E-06, -7.6275609266374249E-06, -2.5215449854180577E-06, 1.1292168823202796E-06, 3.5812571134853394E-07, 1.5672684443241266E-08}; - constexpr FLT c7[] = {4.8970459380161511E-09, 5.4304148291621772E-08, -1.0066736763205116E-08, -5.3239387743771190E-07, 2.2987809872388434E-07, 1.8048974519458305E-06, -1.3449315565530231E-06, -2.4760016203656832E-06, 2.4760016205558345E-06, 1.3449315566530894E-06, -1.8048974519264694E-06, -2.2987809871496018E-07, 5.3239387743957950E-07, 1.0066736763205477E-08, -5.4304148291620039E-08, -4.8970459380161527E-09}; - constexpr FLT c8[] = {1.1055703983904693E-09, 4.3691209554215673E-09, -2.0201061499499309E-08, -2.3275033898522544E-08, 1.2633562932172848E-07, -2.2021804055583841E-08, -2.7912172397333448E-07, 2.1280289571270167E-07, 2.1280289561471954E-07, -2.7912172398563377E-07, -2.2021804043311624E-08, 1.2633562932175524E-07, -2.3275033897953490E-08, -2.0201061499405642E-08, 4.3691209554208717E-09, 1.1055703983904937E-09}; - constexpr FLT c9[] = {1.7210848751142109E-10, -1.3819378018358974E-10, -2.4707116696395418E-09, 4.6626394240840718E-09, 6.2513494821407377E-09, -2.2225751663756647E-08, 7.2716681831167356E-09, 2.9914504875425248E-08, -2.9914504880961111E-08, -7.2716681858846656E-09, 2.2225751666524578E-08, -6.2513494807567727E-09, -4.6626394246030589E-09, 2.4707116695638564E-09, 1.3819378018734865E-10, -1.7210848751139469E-10}; - constexpr FLT c10[] = {1.5548426850891040E-11, -8.2967690037353030E-11, -2.0776280196441915E-11, 6.5818716237227360E-10, -9.7473365318544434E-10, -7.2114132190269774E-10, 2.9974008768194548E-09, -1.8729406654385533E-09, -1.8729407980520035E-09, 2.9974009543459026E-09, -7.2114130179071973E-10, -9.7473365601368880E-10, 6.5818716417921449E-10, -2.0776280166982969E-11, -8.2967690036279040E-11, 1.5548426850876794E-11}; - constexpr FLT c11[] = {1.7715918253734007E-14, -8.7094275492396390E-12, 2.5402078548167017E-11, 5.6643084712743339E-13, -1.1273398069226705E-10, 1.7831197627554656E-10, 2.2124056737037060E-13, -2.7985821416111004E-10, 2.7985826569398559E-10, -2.2122821651802181E-13, -1.7831199885666961E-10, 1.1273397622040666E-10, -5.6643203607501166E-13, -2.5402078628021660E-11, 8.7094275492396907E-12, -1.7715918256992908E-14}; - constexpr FLT c12[] = {-2.1496737418348056E-13, -2.2214973543773537E-14, 2.3291735079229971E-12, -5.9732922869516132E-12, 3.0556730493177866E-12, 1.1858129781605648E-11, -2.4316397039401376E-11, 1.3235569405286772E-11, 1.3235463236132106E-11, -2.4316413373117597E-11, 1.1858131823320733E-11, 3.0556730493176707E-12, -5.9732919041302971E-12, 2.3291735916652542E-12, -2.2214974665309464E-14, -2.1496737416109420E-13}; - constexpr FLT c13[] = {-2.3198933254093550E-14, 8.4680085604099498E-14, -5.5120431569756550E-14, -3.4224865085091971E-13, 1.0093479536840142E-12, -9.9670676529397927E-13, -4.1953479545762892E-13, 2.1120282165025634E-12, -2.1120647150379602E-12, 4.1949829692223215E-13, 9.9668454879417257E-13, -1.0093487471304360E-12, 3.4224795658530073E-13, 5.5120400575755698E-14, -8.4680084102827573E-14, 2.3198933260903755E-14}; + } else if constexpr (w==16) { + constexpr FLT c0[] = {5.2012152104084216E-09, 5.0291159580938711E-06, 3.3201112337137893E-04, 6.3015433246683310E-03, 5.2427915343763398E-02, 2.3104762006593366E-01, 5.9521037322997195E-01, 9.4441119081353830E-01, 9.4441119081353853E-01, 5.9521037322997183E-01, 2.3104762006593366E-01, 5.2427915343763384E-02, 6.3015433246683319E-03, 3.3201112337137898E-04, 5.0291159580938651E-06, 5.2012152104083678E-09}; + constexpr FLT c1[] = {2.5620581163903688E-08, 1.2815874111792776E-05, 5.7471335914300616E-04, 7.8386860177525469E-03, 4.6638901641906920E-02, 1.3897554029141554E-01, 2.0773808644544120E-01, 1.0813440420918309E-01, -1.0813440420918333E-01, -2.0773808644544134E-01, -1.3897554029141557E-01, -4.6638901641906934E-02, -7.8386860177525487E-03, -5.7471335914300616E-04, -1.2815874111792776E-05, -2.5620581163903665E-08}; + constexpr FLT c2[] = {5.6049296769722354E-08, 1.4879146623074257E-05, 4.4787865139353375E-04, 4.2383440773521705E-03, 1.6624620601556186E-02, 2.6395394769117647E-02, 3.6740117889103615E-04, -4.8088574473126817E-02, -4.8088574473126838E-02, 3.6740117889102304E-04, 2.6395394769117623E-02, 1.6624620601556190E-02, 4.2383440773521705E-03, 4.4787865139353375E-04, 1.4879146623074262E-05, 5.6049296769722407E-08}; + constexpr FLT c3[] = {7.2283166867263396E-08, 1.0391634193778177E-05, 2.0529674430143889E-04, 1.2618687081127943E-03, 2.6256301814801125E-03, -5.5040645592547600E-04, -7.8709464111364428E-03, -5.7657980103486724E-03, 5.7657980103487556E-03, 7.8709464111365469E-03, 5.5040645592550766E-04, -2.6256301814801082E-03, -1.2618687081127936E-03, -2.0529674430143870E-04, -1.0391634193778174E-05, -7.2283166867263369E-08}; + constexpr FLT c4[] = {6.1501023800531295E-08, 4.8443034242391141E-06, 6.0167136036954428E-05, 2.0573318254802012E-04, 1.2811955521411533E-05, -8.3782209201443785E-04, -6.2669687707138247E-04, 1.1809008871738562E-03, 1.1809008871738887E-03, -6.2669687707137044E-04, -8.3782209201442733E-04, 1.2811955521412302E-05, 2.0573318254801866E-04, 6.0167136036954374E-05, 4.8443034242391115E-06, 6.1501023800531295E-08}; + constexpr FLT c5[] = {3.6571939291734554E-08, 1.5742222553115364E-06, 1.1217451065775625E-05, 1.0668471374316040E-05, -6.0694020243076541E-05, -7.4268888177684097E-05, 1.3567546096376892E-04, 1.4875477215024236E-04, -1.4875477215050417E-04, -1.3567546096388268E-04, 7.4268888177625266E-05, 6.0694020243062074E-05, -1.0668471374318224E-05, -1.1217451065775730E-05, -1.5742222553115388E-06, -3.6571939291734560E-08}; + constexpr FLT c6[] = {1.5672684443241253E-08, 3.5812571134852896E-07, 1.1292168823202123E-06, -2.5215449854190674E-06, -7.6275609266474487E-06, 9.3973092319731781E-06, 1.7891569285056421E-05, -1.8642776809505442E-05, -1.8642776809471432E-05, 1.7891569285033097E-05, 9.3973092319544841E-06, -7.6275609266469057E-06, -2.5215449854189081E-06, 1.1292168823202494E-06, 3.5812571134853351E-07, 1.5672684443241289E-08}; + constexpr FLT c7[] = {4.8970459380161428E-09, 5.4304148291624194E-08, -1.0066736763072382E-08, -5.3239387743769898E-07, 2.2987809872759821E-07, 1.8048974519708698E-06, -1.3449315565748933E-06, -2.4760016204359734E-06, 2.4760016204603713E-06, 1.3449315565910754E-06, -1.8048974519599853E-06, -2.2987809872813671E-07, 5.3239387743785653E-07, 1.0066736763187962E-08, -5.4304148291621945E-08, -4.8970459380161354E-09}; + constexpr FLT c8[] = {1.1055703983904660E-09, 4.3691209554212191E-09, -2.0201061499396576E-08, -2.3275033896957558E-08, 1.2633562932116967E-07, -2.2021804058048667E-08, -2.7912172398215538E-07, 2.1280289570094432E-07, 2.1280289560039370E-07, -2.7912172401993432E-07, -2.2021804058375159E-08, 1.2633562931645146E-07, -2.3275033899474533E-08, -2.0201061499488096E-08, 4.3691209554177036E-09, 1.1055703983904669E-09}; + constexpr FLT c9[] = {1.7210848751138681E-10, -1.3819378018658845E-10, -2.4707116696395422E-09, 4.6626394237380811E-09, 6.2513494800647907E-09, -2.2225751680364230E-08, 7.2716681499015802E-09, 2.9914504847745958E-08, -2.9914504929399881E-08, -7.2716682190998235E-09, 2.2225751651300968E-08, -6.2513494835247027E-09, -4.6626394241705700E-09, 2.4707116695422323E-09, 1.3819378018447668E-10, -1.7210848751140263E-10}; + constexpr FLT c10[] = {1.5548426850855142E-11, -8.2967690041189158E-11, -2.0776280294645870E-11, 6.5818716237227536E-10, -9.7473366701243995E-10, -7.2114134107195186E-10, 2.9974008482669047E-09, -1.8729408099935154E-09, -1.8729408458180050E-09, 2.9974008300895111E-09, -7.2114135112795265E-10, -9.7473366984068834E-10, 6.5818716150808865E-10, -2.0776280304466481E-11, -8.2967690039194425E-11, 1.5548426850879734E-11}; + constexpr FLT c11[] = {1.7715918249318497E-14, -8.7094275472987195E-12, 2.5402078659077626E-11, 5.6643185861776250E-13, -1.1273397189051321E-10, 1.7831200266308735E-10, 2.2125788695759232E-13, -2.7985820464955105E-10, 2.7985821061203020E-10, -2.2125561553264423E-13, -1.7831201299095762E-10, 1.1273397203247594E-10, -5.6643244421784287E-13, -2.5402078579222992E-11, 8.7094275478531680E-12, -1.7715918246051124E-14}; + constexpr FLT c12[] = {-2.1496737416796769E-13, -2.2214973169845399E-14, 2.3291736435064423E-12, -5.9732908832747121E-12, 3.0556773879593299E-12, 1.1858139990185910E-11, -2.4316386830832370E-11, 1.3235593905851255E-11, 1.3235467319571833E-11, -2.4316433790261075E-11, 1.1858129781598945E-11, 3.0556712628198899E-12, -5.9732929568879531E-12, 2.3291735079222313E-12, -2.2214976659196139E-14, -2.1496737416785529E-13}; + constexpr FLT c13[] = {-2.3198933267726957E-14, 8.4680084110917159E-14, -5.5120422834884429E-14, -3.4224862020134189E-13, 1.0093466820639831E-12, -9.9670059417301577E-13, -4.1953332933742669E-13, 2.1120539454092368E-12, -2.1120647488999610E-12, 4.1950459318924021E-13, 9.9668795371011618E-13, -1.0093471667352018E-12, 3.4224875797881982E-13, 5.5120389367396242E-14, -8.4680084697278088E-14, 2.3198933261276072E-14}; for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); } else printf("width not implemented!\n"); diff --git a/include/cufinufft/defs.h b/include/cufinufft/defs.h index 630989a26..caa331bae 100644 --- a/include/cufinufft/defs.h +++ b/include/cufinufft/defs.h @@ -6,6 +6,7 @@ // upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for // common #define MAX_NSPREAD 16 +#define MIN_NSPREAD 2 // max number of positive quadr nodes #define MAX_NQUAD 100 diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index 9efd094c8..8e35f178d 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -74,8 +74,8 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) template int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth); -template -static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int ns) +template +static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta) /* ES ("exp sqrt") kernel evaluation at single real argument: phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)), for |x| < nspread/2 related to an asymptotic approximation to the Kaiser--Bessel, itself an @@ -88,9 +88,8 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int : 0.0; } -template -static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w, - const double upsampfac) +template +static __device__ void eval_kernel_vec_horner(T *ker, const T x, const double upsampfac) /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. This is the current evaluation method, since it's faster (except i7 w=16). @@ -109,11 +108,11 @@ static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w, } } -template -static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const int w, - const T es_c, const T es_beta) { +template +static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const T es_c, + const T es_beta) { for (int i = 0; i < w; i++) { - ker[i] = evaluate_kernel(abs(x + i), es_c, es_beta, w); + ker[i] = evaluate_kernel(abs(x + i), es_c, es_beta); } } @@ -129,53 +128,53 @@ template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); // Wrappers for methods of spreading template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template +template int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize); template int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template +template int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize); template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template +template int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize); template int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template +template int cuspread2d_subprob(int nf1, int nf2, int m, cufinufft_plan_t *d_plan, int blksize); template int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template +template int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); template int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template +template int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); template int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template +template int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); // Wrappers for methods of interpolation -template +template int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize); -template +template int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize); -template +template int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize); -template +template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); -template +template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index de01a9ea3..29298e5db 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -10,9 +10,12 @@ #include #include #include +#include // for std::forward #include +#include + #ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES #endif @@ -47,18 +50,6 @@ template __forceinline__ __device__ auto interval(const int ns, cons return int2{xstart, xend}; } -// Define a macro to check if NVCC version is >= 11.3 -#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) -#if (__CUDACC_VER_MAJOR__ > 11) || \ - (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600) -#define ALLOCA_SUPPORTED 1 -// windows compatibility -#if __has_include() -#include -#endif -#endif -#endif - #if defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 900 #define COMPUTE_CAPABILITY_90_OR_HIGHER 1 @@ -191,6 +182,28 @@ auto set_nhg_type3(T S, T X, const cufinufft_opts &opts, return std::make_tuple(nf, h, gam); } +// Generalized dispatcher for any function requiring ns-based dispatch +template +int dispatch_ns(Func &&func, int target_ns, Args &&...args) { + if constexpr (ns > MAX_NSPREAD) { + return FINUFFT_ERR_METHOD_NOTVALID; // Stop recursion + } else { + if (target_ns == ns) { + return std::forward(func).template operator()( + std::forward(args)...); + } + return dispatch_ns(std::forward(func), target_ns, + std::forward(args)...); + } +} + +// Wrapper function that starts the dispatch recursion +template +int launch_dispatch_ns(Func &&func, int target_ns, Args &&...args) { + return dispatch_ns(std::forward(func), target_ns, + std::forward(args)...); +} + } // namespace utils } // namespace cufinufft diff --git a/src/cuda/1d/interp1d_wrapper.cu b/src/cuda/1d/interp1d_wrapper.cu index 2bf69f6a2..432a09991 100644 --- a/src/cuda/1d/interp1d_wrapper.cu +++ b/src/cuda/1d/interp1d_wrapper.cu @@ -10,41 +10,46 @@ namespace cufinufft { namespace spreadinterp { -template -int cuinterp1d(cufinufft_plan_t *d_plan, int blksize) -/* - A wrapper for different interpolation methods. - - Methods available: - (1) Non-uniform points driven - (2) Subproblem - - Melody Shih 11/21/21 -*/ -{ - int nf1 = d_plan->nf1; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuinterp1d_nuptsdriven(nf1, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl; - ier = FINUFFT_ERR_METHOD_NOTVALID; +// Functor to handle function selection (nuptsdriven vs subprob) +struct Interp1DDispatcher { + template + int operator()(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) const { + switch (d_plan->opts.gpu_method) { + case 1: + return cuinterp1d_nuptsdriven(nf1, M, d_plan, blksize); + default: + std::cerr << "[cuinterp1d] error: incorrect method, should be 1\n"; + return FINUFFT_ERR_METHOD_NOTVALID; + } } - - return ier; +}; + +// Updated cuinterp1d using generic dispatch +template int cuinterp1d(cufinufft_plan_t *d_plan, int blksize) { + /* + A wrapper for different interpolation methods. + + Methods available: + (1) Non-uniform points driven + (2) Subproblem + + Melody Shih 11/21/21 + + Now the function is updated to dispatch based on ns. This is to avoid alloca which + it seems slower according to the MRI community. + Marco Barbone 01/30/25 + */ + return launch_dispatch_ns(Interp1DDispatcher(), + d_plan->spopts.nspread, d_plan->nf1, + d_plan->M, d_plan, blksize); } -template +template int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; dim3 threadsPerBlock; dim3 blocks; - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells T es_c = d_plan->spopts.ES_c; T es_beta = d_plan->spopts.ES_beta; T sigma = d_plan->opts.upsampfac; @@ -61,16 +66,14 @@ int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blks if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - interp_1d_nuptsdriven<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, - d_idxnupts); + interp_1d_nuptsdriven<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts); RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - interp_1d_nuptsdriven<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, - d_idxnupts); + interp_1d_nuptsdriven<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts); RETURN_IF_CUDA_ERROR } } diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index 1b2afde7d..f79b6e605 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -20,35 +20,39 @@ using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { -template -int cuspread1d(cufinufft_plan_t *d_plan, int blksize) -/* +// Functor to handle function selection (nuptsdriven vs subprob) +struct Spread1DDispatcher { + template + int operator()(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) const { + switch (d_plan->opts.gpu_method) { + case 1: + return cuspread1d_nuptsdriven(nf1, M, d_plan, blksize); + case 2: + return cuspread1d_subprob(nf1, M, d_plan, blksize); + default: + std::cerr << "[cuspread1d] error: incorrect method, should be 1 or 2\n"; + return FINUFFT_ERR_METHOD_NOTVALID; + } + } +}; + +// Updated cuspread1d using generic dispatch +template int cuspread1d(cufinufft_plan_t *d_plan, int blksize) { + /* A wrapper for different spreading methods. Methods available: - (1) Non-uniform points driven - (2) Subproblem + (1) Non-uniform points driven Melody Shih 11/21/21 -*/ -{ - int nf1 = d_plan->nf1; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuspread1d_nuptsdriven(nf1, M, d_plan, blksize); - } break; - case 2: { - ier = cuspread1d_subprob(nf1, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuspread1d] error: incorrect method, should be 1 or 2\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - return ier; + Now the function is updated to dispatch based on ns. This is to avoid alloca which + it seems slower according to the MRI community. + Marco Barbone 01/30/25 + */ + return launch_dispatch_ns(Spread1DDispatcher(), + d_plan->spopts.nspread, d_plan->nf1, + d_plan->M, d_plan, blksize); } template struct cmp : public thrust::binary_function { @@ -117,13 +121,12 @@ int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { return 0; } -template +template int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; dim3 threadsPerBlock; dim3 blocks; - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells int *d_idxnupts = d_plan->idxnupts; T es_c = d_plan->spopts.ES_c; T es_beta = d_plan->spopts.ES_beta; @@ -140,16 +143,14 @@ int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blks if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - spread_1d_nuptsdriven<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, - d_idxnupts); + spread_1d_nuptsdriven<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts); RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - spread_1d_nuptsdriven<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, - d_idxnupts); + spread_1d_nuptsdriven<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_idxnupts); RETURN_IF_CUDA_ERROR } } @@ -233,13 +234,11 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) return 0; } -template +template int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; + auto &stream = d_plan->stream; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; // assume that bin_size_x > ns/2; @@ -266,30 +265,27 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } for (int t = 0; t < blksize; t++) { - - if (const auto finufft_err = - cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan) != 0) { - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } - RETURN_IF_CUDA_ERROR - spread_1d_subprob<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, - d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, maxsubprobsize, numbins, d_idxnupts); + spread_1d_subprob<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_binstartpts, + d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, + maxsubprobsize, numbins, d_idxnupts); RETURN_IF_CUDA_ERROR } } else { + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } for (int t = 0; t < blksize; t++) { - if (const auto finufft_err = - cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan) != 0) { - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } - RETURN_IF_CUDA_ERROR - spread_1d_subprob<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, - d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, maxsubprobsize, numbins, d_idxnupts); + spread_1d_subprob<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, nf1, es_c, es_beta, sigma, d_binstartpts, + d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, + maxsubprobsize, numbins, d_idxnupts); RETURN_IF_CUDA_ERROR } } diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index 72c776c06..139879430 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -18,18 +18,11 @@ namespace spreadinterp { /* ------------------------ 1d Spreading Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template +template __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, - cuda_complex *fw, int M, int ns, int nf1, T es_c, + cuda_complex *fw, int M, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { - // dynamic stack allocation to reduce stack usage -#if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns); - auto *__restrict__ ker1 = ker; -#else - T ker1[MAX_NSPREAD]; -#endif - + T ker1[ns]; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); @@ -37,9 +30,9 @@ __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, const auto [xstart, xend] = interval(ns, x_rescaled); const T x1 = (T)xstart - x_rescaled; if constexpr (KEREVALMETH == 1) - eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker1, x1, sigma); else - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, es_c, es_beta); for (auto xx = xstart; xx <= xend; xx++) { auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); @@ -89,12 +82,12 @@ __global__ void calc_inverse_of_global_sort_idx_1d( } } -template +template __global__ void spread_1d_subprob( - const T *x, const cuda_complex *c, cuda_complex *fw, int M, uint8_t ns, int nf1, - T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size, - int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int *idxnupts) { + const T *x, const cuda_complex *c, cuda_complex *fw, int M, int nf1, T es_c, + T es_beta, T sigma, const int *binstartpts, const int *bin_size, int bin_size_x, + const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob, + int maxsubprobsize, int nbinx, int *idxnupts) { extern __shared__ char sharedbuf[]; auto *__restrict__ fwshared = (cuda_complex *)sharedbuf; @@ -107,13 +100,7 @@ __global__ void spread_1d_subprob( const auto ns_2 = (ns + 1) / 2; const int N = bin_size_x + 2 * ns_2; - // dynamic stack allocation -#if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns); - auto *__restrict__ ker1 = ker; -#else - T ker1[MAX_NSPREAD]; -#endif + T ker1[ns]; for (int i = threadIdx.x; i < N; i += blockDim.x) { fwshared[i] = {0, 0}; @@ -130,9 +117,9 @@ __global__ void spread_1d_subprob( const auto [xstart, xend] = interval(ns, x_rescaled); const T x1 = T(xstart + xoffset) - x_rescaled; if constexpr (KEREVALMETH == 1) - eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker1, x1, sigma); else - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, es_c, es_beta); for (int xx = xstart; xx <= xend; xx++) { const auto ix = xx + ns_2; if (ix >= (bin_size_x + ns_2) || ix < 0) break; @@ -154,17 +141,13 @@ __global__ void spread_1d_subprob( /* --------------------- 1d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template +template __global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, - const cuda_complex *fw, int M, int ns, int nf1, - T es_c, T es_beta, T sigma, const int *idxnupts) { - // dynamic stack allocation -#if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns); - auto *__restrict__ ker1 = ker; -#else - T ker1[MAX_NSPREAD]; -#endif + const cuda_complex *fw, int M, int nf1, T es_c, + T es_beta, T sigma, const int *idxnupts) { + + T ker1[ns]; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); @@ -174,9 +157,9 @@ __global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, const T x1 = (T)xstart - x_rescaled; if constexpr (KEREVALMETH == 1) - eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker1, x1, sigma); else - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, es_c, es_beta); for (int xx = xstart; xx <= xend; xx++) { int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); const T kervalue1 = ker1[xx - xstart]; diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu index 0d3d3ff9b..13c287c5a 100644 --- a/src/cuda/2d/interp2d_wrapper.cu +++ b/src/cuda/2d/interp2d_wrapper.cu @@ -13,39 +13,44 @@ using namespace cufinufft::common; namespace cufinufft { namespace spreadinterp { -template -int cuinterp2d(cufinufft_plan_t *d_plan, int blksize) -/* +// Functor to handle function selection (nuptsdriven vs subprob) +struct Interp2DDispatcher { + template + int operator()(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize) const { + switch (d_plan->opts.gpu_method) { + case 1: + return cuinterp2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); + case 2: + return cuinterp2d_subprob(nf1, nf2, M, d_plan, blksize); + default: + std::cerr << "[cuinterp2d] error: incorrect method, should be 1 or 2\n"; + return FINUFFT_ERR_METHOD_NOTVALID; + } + } +}; + +// Updated cuinterp2d using generic dispatch +template int cuinterp2d(cufinufft_plan_t *d_plan, int blksize) { + /* A wrapper for different interpolation methods. Methods available: - (1) Non-uniform points driven - (2) Subproblem + (1) Non-uniform points driven + (2) Subproblem Melody Shih 07/25/19 -*/ -{ - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuinterp2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); - } break; - case 2: { - ier = cuinterp2d_subprob(nf1, nf2, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuinterp2d] error: incorrect method, should be 1 or 2\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - return ier; + Now the function is updated to dispatch based on ns. This is to avoid alloca which + it seems slower according to the MRI community. + Marco Barbone 01/30/25 + */ + return launch_dispatch_ns( + Interp2DDispatcher(), d_plan->spopts.nspread, d_plan->nf1, d_plan->nf2, d_plan->M, + d_plan, blksize); } -template +template int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; @@ -53,7 +58,6 @@ int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, dim3 threadsPerBlock; dim3 blocks; - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells T es_c = d_plan->spopts.ES_c; T es_beta = d_plan->spopts.ES_beta; T sigma = d_plan->opts.upsampfac; @@ -72,15 +76,15 @@ int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - interp_2d_nupts_driven<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + interp_2d_nupts_driven<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, nf1, nf2, es_c, es_beta, sigma, d_idxnupts); RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - interp_2d_nupts_driven<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + interp_2d_nupts_driven<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, nf1, nf2, es_c, es_beta, sigma, d_idxnupts); RETURN_IF_CUDA_ERROR } @@ -89,14 +93,13 @@ int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, return 0; } -template +template int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; // assume that bin_size_x > ns/2; @@ -125,20 +128,26 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { + if (const auto finufft_err = + cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan)) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } for (int t = 0; t < blksize; t++) { - cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan); - interp_2d_subprob<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + interp_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], d_idxnupts); RETURN_IF_CUDA_ERROR } } else { + if (const auto finufft_err = + cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan)) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } for (int t = 0; t < blksize; t++) { - cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan); - interp_2d_subprob<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + interp_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], d_idxnupts); diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index 490c8eed1..564ac4847 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -18,36 +18,41 @@ using namespace cufinufft::common; namespace cufinufft { namespace spreadinterp { -template -int cuspread2d(cufinufft_plan_t *d_plan, int blksize) -/* +// Functor to handle function selection (nuptsdriven vs subprob) +struct Spread2DDispatcher { + template + int operator()(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize) const { + switch (d_plan->opts.gpu_method) { + case 1: + return cuspread2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); + case 2: + return cuspread2d_subprob(nf1, nf2, M, d_plan, blksize); + default: + std::cerr << "[cuspread2d] error: incorrect method, should be 1 or 2\n"; + return FINUFFT_ERR_METHOD_NOTVALID; + } + } +}; + +// Updated cuspread2d using generic dispatch +template int cuspread2d(cufinufft_plan_t *d_plan, int blksize) { + /* A wrapper for different spreading methods. Methods available: - (1) Non-uniform points driven - (2) Subproblem + (1) Non-uniform points driven + (2) Subproblem Melody Shih 07/25/19 -*/ -{ - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuspread2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); - } break; - case 2: { - ier = cuspread2d_subprob(nf1, nf2, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuspread2d] error: incorrect method, should be 1 or 2\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - return ier; + Now the function is updated to dispatch based on ns. This is to avoid alloca which + it seems slower according to the MRI community. + Marco Barbone 01/30/25 + */ + return launch_dispatch_ns( + Spread2DDispatcher(), d_plan->spopts.nspread, d_plan->nf1, d_plan->nf2, d_plan->M, + d_plan, blksize); } template @@ -104,14 +109,13 @@ int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_ return 0; } -template +template int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; dim3 threadsPerBlock; dim3 blocks; - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells int *d_idxnupts = d_plan->idxnupts; T es_c = d_plan->spopts.ES_c; T es_beta = d_plan->spopts.ES_beta; @@ -128,15 +132,15 @@ int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, blocks.y = 1; if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - spread_2d_nupts_driven<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + spread_2d_nupts_driven<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, nf1, nf2, es_c, es_beta, sigma, d_idxnupts); RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - spread_2d_nupts_driven<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + spread_2d_nupts_driven<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, nf1, nf2, es_c, es_beta, sigma, d_idxnupts); RETURN_IF_CUDA_ERROR } @@ -237,14 +241,13 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan return 0; } -template +template int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; // assume that bin_size_x > ns/2; @@ -275,28 +278,26 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } for (int t = 0; t < blksize; t++) { - if (const auto finufft_err = - cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan) != 0) { - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } - RETURN_IF_CUDA_ERROR - spread_2d_subprob<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + spread_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], d_idxnupts); RETURN_IF_CUDA_ERROR } } else { + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } for (int t = 0; t < blksize; t++) { - if (const auto finufft_err = - cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan) != 0) { - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } - RETURN_IF_CUDA_ERROR - spread_2d_subprob<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + spread_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], d_idxnupts); diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index 805e921aa..24b4b56ce 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -15,18 +15,12 @@ namespace spreadinterp { /* ------------------------ 2d Spreading Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void spread_2d_nupts_driven( - const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { -#if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 2); - auto *__restrict__ ker1 = ker; - auto *__restrict__ ker2 = ker + ns; -#else - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; -#endif +template +__global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_complex *c, + cuda_complex *fw, int M, int nf1, int nf2, + T es_c, T es_beta, T sigma, const int *idxnupts) { + T ker1[ns]; + T ker2[ns]; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); @@ -39,11 +33,11 @@ __global__ void spread_2d_nupts_driven( const auto y1 = (T)ystart - y_rescaled; if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker1, x1, sigma); + eval_kernel_vec_horner(ker2, y1, sigma); } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, es_c, es_beta); + eval_kernel_vec(ker2, y1, es_c, es_beta); } for (auto yy = ystart; yy <= yend; yy++) { @@ -112,10 +106,10 @@ __global__ void calc_inverse_of_global_sort_index_2d( } } -template +template __global__ void spread_2d_subprob( - const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, + const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int nf1, + int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, const int *idxnupts) { @@ -136,14 +130,8 @@ __global__ void spread_2d_subprob( const auto rounded_ns = ns_2 * 2; const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); -#if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 2); - auto *__restrict__ ker1 = ker; - auto *__restrict__ ker2 = ker + ns; -#else - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; -#endif + T ker1[ns]; + T ker2[ns]; for (int i = threadIdx.x; i < N; i += blockDim.x) { fwshared[i] = {0, 0}; @@ -166,11 +154,11 @@ __global__ void spread_2d_subprob( yend -= yoffset; if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker1, x1, sigma); + eval_kernel_vec_horner(ker2, y1, sigma); } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, es_c, es_beta); + eval_kernel_vec(ker2, y1, es_c, es_beta); } for (int yy = ystart; yy <= yend; yy++) { @@ -206,18 +194,12 @@ __global__ void spread_2d_subprob( /* --------------------- 2d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void interp_2d_nupts_driven( - const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { -#if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 2); - auto *__restrict__ ker1 = ker; - auto *__restrict__ ker2 = ker + ns; -#else - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; -#endif +template +__global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex *c, + const cuda_complex *fw, int M, int nf1, int nf2, + T es_c, T es_beta, T sigma, const int *idxnupts) { + T ker1[ns]; + T ker2[ns]; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { @@ -230,11 +212,11 @@ __global__ void interp_2d_nupts_driven( T y1 = (T)ystart - y_rescaled; if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker1, x1, sigma); + eval_kernel_vec_horner(ker2, y1, sigma); } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, es_c, es_beta); + eval_kernel_vec(ker2, y1, es_c, es_beta); } cuda_complex cnow{0, 0}; @@ -254,24 +236,18 @@ __global__ void interp_2d_nupts_driven( } /* Kernels for Subprob Method */ -template +template __global__ void interp_2d_subprob( - const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, + const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int nf1, + int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, const int *idxnupts) { extern __shared__ char sharedbuf[]; cuda_complex *fwshared = (cuda_complex *)sharedbuf; -#if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 2); - auto *__restrict__ ker1 = ker; - auto *__restrict__ ker2 = ker + ns; -#else - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; -#endif + T ker1[ns]; + T ker2[ns]; const auto subpidx = blockIdx.x; const auto bidx = subprob_to_bin[subpidx]; @@ -319,11 +295,11 @@ __global__ void interp_2d_subprob( yend -= yoffset; if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker1, x1, sigma); + eval_kernel_vec_horner(ker2, y1, sigma); } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, es_c, es_beta); + eval_kernel_vec(ker2, y1, es_c, es_beta); } for (int yy = ystart; yy <= yend; yy++) { diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu index 51c620756..f65599676 100644 --- a/src/cuda/3d/interp3d_wrapper.cu +++ b/src/cuda/3d/interp3d_wrapper.cu @@ -14,45 +14,48 @@ using namespace cufinufft::common; namespace cufinufft { namespace spreadinterp { -template -int cuinterp3d(cufinufft_plan_t *d_plan, int blksize) -/* +// Functor to handle function selection (nuptsdriven vs subprob) +struct Interp3DDispatcher { + template + int operator()(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize) const { + switch (d_plan->opts.gpu_method) { + case 1: + return cuinterp3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); + case 2: + return cuinterp3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); + default: + std::cerr << "[cuinterp3d] error: incorrect method, should be 1 or 2\n"; + return FINUFFT_ERR_METHOD_NOTVALID; + } + } +}; + +// Updated cuinterp3d using generic dispatch +template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize) { + /* A wrapper for different interpolation methods. Methods available: - (1) Non-uniform points driven - (2) Subproblem + (1) Non-uniform points driven + (2) Subproblem Melody Shih 07/25/19 -*/ -{ - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nf3 = d_plan->nf3; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuinterp3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); - } break; - case 2: { - ier = cuinterp3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuinterp3d] error: incorrect method, should be 1,2\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - return ier; + Now the function is updated to dispatch based on ns. This is to avoid alloca which + it seems slower according to the MRI community. + Marco Barbone 01/30/25 + */ + return launch_dispatch_ns( + Interp3DDispatcher(), d_plan->spopts.nspread, d_plan->nf1, d_plan->nf2, d_plan->nf3, + d_plan->M, d_plan, blksize); } -template +template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { const auto stream = d_plan->stream; - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells T es_c = d_plan->spopts.ES_c; T es_beta = d_plan->spopts.ES_beta; T sigma = d_plan->spopts.upsampfac; @@ -70,15 +73,15 @@ int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - interp_3d_nupts_driven<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + interp_3d_nupts_driven<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts); RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - interp_3d_nupts_driven<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + interp_3d_nupts_driven<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts); RETURN_IF_CUDA_ERROR } @@ -87,12 +90,11 @@ int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t return 0; } -template +template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; // assume that bin_size_x > ns/2; @@ -125,19 +127,27 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ shared_memory_required(3, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); - for (int t = 0; t < blksize; t++) { - if (d_plan->opts.gpu_kerevalmeth == 1) { - cufinufft_set_shared_memory(interp_3d_subprob, 3, *d_plan); - interp_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + if (d_plan->opts.gpu_kerevalmeth == 1) { + if (const auto finufft_err = + cufinufft_set_shared_memory(interp_3d_subprob, 3, *d_plan)) { + return finufft_err; + } + for (int t = 0; t < blksize; t++) { + interp_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, nf1, nf2, nf3, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts); RETURN_IF_CUDA_ERROR - } else { - cufinufft_set_shared_memory(interp_3d_subprob, 3, *d_plan); - interp_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + } + } else { + if (const auto finufft_err = + cufinufft_set_shared_memory(interp_3d_subprob, 3, *d_plan)) { + return finufft_err; + } + for (int t = 0; t < blksize; t++) { + interp_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, nf1, nf2, nf3, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts); @@ -151,15 +161,5 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); -template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, - cufinufft_plan_t *d_plan, int blksize); -template int cuinterp3d_nuptsdriven( - int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); - -template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, - cufinufft_plan_t *d_plan, int blksize); -template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, - cufinufft_plan_t *d_plan, int blksize); - } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index a0411c2b1..e48dba837 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -18,41 +18,44 @@ using namespace cufinufft::common; namespace cufinufft { namespace spreadinterp { -template -int cuspread3d(cufinufft_plan_t *d_plan, int blksize) -/* +// Functor to handle function selection (nuptsdriven, subprob, blockgather) +struct Spread3DDispatcher { + template + int operator()(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize) const { + switch (d_plan->opts.gpu_method) { + case 1: + return cuspread3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); + case 2: + return cuspread3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); + case 4: + return cuspread3d_blockgather(nf1, nf2, nf3, M, d_plan, blksize); + default: + std::cerr << "[cuspread3d] error: incorrect method, should be 1, 2, or 4\n"; + return FINUFFT_ERR_METHOD_NOTVALID; + } + } +}; + +// Updated cuspread3d using generic dispatch +template int cuspread3d(cufinufft_plan_t *d_plan, int blksize) { + /* A wrapper for different spreading methods. Methods available: - (1) Non-uniform points driven - (2) Subproblem - (4) Block gather + (1) Non-uniform points driven + (2) Subproblem + (4) Block gather Melody Shih 07/25/19 -*/ -{ - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nf3 = d_plan->nf3; - int M = d_plan->M; - - int ier = 0; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuspread3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); - } break; - case 2: { - ier = cuspread3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); - } break; - case 4: { - ier = cuspread3d_blockgather(nf1, nf2, nf3, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuspread3d] error: incorrect method, should be 1,2,4" << std::endl; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - return ier; + Now the function is updated to dispatch based on ns. This is to avoid alloca which + it seems slower according to the MRI community. + Marco Barbone 01/30/25 + */ + return launch_dispatch_ns( + Spread3DDispatcher(), d_plan->spopts.nspread, d_plan->nf1, d_plan->nf2, d_plan->nf3, + d_plan->M, d_plan, blksize); } template @@ -113,7 +116,7 @@ int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, return 0; } -template +template int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; @@ -121,7 +124,6 @@ int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t dim3 threadsPerBlock; dim3 blocks; - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells T sigma = d_plan->spopts.upsampfac; T es_c = d_plan->spopts.ES_c; T es_beta = d_plan->spopts.ES_beta; @@ -140,15 +142,15 @@ int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t if (d_plan->opts.gpu_kerevalmeth == 1) { for (int t = 0; t < blksize; t++) { - spread_3d_nupts_driven<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + spread_3d_nupts_driven<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts); RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - spread_3d_nupts_driven<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + spread_3d_nupts_driven<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts); RETURN_IF_CUDA_ERROR } @@ -338,12 +340,11 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, return 0; } -template +template int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; - int ns = d_plan->spopts.nspread; T es_c = d_plan->spopts.ES_c; T es_beta = d_plan->spopts.ES_beta; T sigma = d_plan->spopts.upsampfac; @@ -387,18 +388,22 @@ int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth == 1) { - spread_3d_block_gather<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, - es_c, es_beta, sigma, d_binstartpts, obin_size_x, obin_size_y, obin_size_z, - binsperobinx * binsperobiny * binsperobinz, d_subprob_to_bin, d_subprobstartpts, - maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts); + spread_3d_block_gather + <<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, nf1, nf2, nf3, + es_c, es_beta, sigma, d_binstartpts, obin_size_x, obin_size_y, obin_size_z, + binsperobinx * binsperobiny * binsperobinz, d_subprob_to_bin, + d_subprobstartpts, maxsubprobsize, numobins[0], numobins[1], numobins[2], + d_idxnupts); RETURN_IF_CUDA_ERROR } else { - spread_3d_block_gather<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, - es_c, es_beta, sigma, d_binstartpts, obin_size_x, obin_size_y, obin_size_z, - binsperobinx * binsperobiny * binsperobinz, d_subprob_to_bin, d_subprobstartpts, - maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts); + spread_3d_block_gather + <<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, nf1, nf2, nf3, + es_c, es_beta, sigma, d_binstartpts, obin_size_x, obin_size_y, obin_size_z, + binsperobinx * binsperobiny * binsperobinz, d_subprob_to_bin, + d_subprobstartpts, maxsubprobsize, numobins[0], numobins[1], numobins[2], + d_idxnupts); RETURN_IF_CUDA_ERROR } } @@ -496,12 +501,11 @@ int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, return 0; } -template +template int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; // assume that bin_size_x > ns/2; @@ -534,27 +538,27 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ const auto sharedplanorysize = shared_memory_required(3, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); - for (int t = 0; t < blksize; t++) { - if (d_plan->opts.gpu_kerevalmeth) { - if (const auto finufft_err = - cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan) != 0) { - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } - RETURN_IF_CUDA_ERROR - spread_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + if (d_plan->opts.gpu_kerevalmeth) { + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + for (int t = 0; t < blksize; t++) { + spread_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, nf1, nf2, nf3, sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts); RETURN_IF_CUDA_ERROR - } else { - if (const auto finufft_err = - cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan) != 0) { - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } - RETURN_IF_CUDA_ERROR - spread_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + } + } else { + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + for (int t = 0; t < blksize; t++) { + spread_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, nf1, nf2, nf3, sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts); diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh index 298ae4a43..fdfea4033 100644 --- a/src/cuda/3d/spreadinterp3d.cuh +++ b/src/cuda/3d/spreadinterp3d.cuh @@ -77,21 +77,13 @@ __global__ void calc_inverse_of_global_sort_index_3d( } /* Kernels for NUptsdriven method */ -template -__global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, - const cuda_complex *c, cuda_complex *fw, - int M, int ns, int nf1, int nf2, int nf3, T es_c, - T es_beta, T sigma, const int *idxnupts) { -#if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); - auto *__restrict__ ker1 = ker; - auto *__restrict__ ker2 = ker + ns; - auto *__restrict__ ker3 = ker + ns + ns; -#else - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; -#endif +template +__global__ void spread_3d_nupts_driven( + const T *x, const T *y, const T *z, const cuda_complex *c, cuda_complex *fw, + int M, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, const int *idxnupts) { + T ker1[ns]; + T ker2[ns]; + T ker3[ns]; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); @@ -107,13 +99,13 @@ __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const auto z1 = T(zstart) - z_rescaled; if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - eval_kernel_vec_horner(ker3, z1, ns, sigma); + eval_kernel_vec_horner(ker1, x1, sigma); + eval_kernel_vec_horner(ker2, y1, sigma); + eval_kernel_vec_horner(ker3, z1, sigma); } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - eval_kernel_vec(ker3, z1, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, es_c, es_beta); + eval_kernel_vec(ker2, y1, es_c, es_beta); + eval_kernel_vec(ker3, z1, es_c, es_beta); } for (int zz = zstart; zz <= zend; zz++) { @@ -137,13 +129,12 @@ __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, } /* Kernels for Subprob method */ -template +template __global__ void spread_3d_subprob( - T *x, T *y, T *z, cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, - int nf2, int nf3, T sigma, T es_c, T es_beta, int *binstartpts, int *bin_size, - int bin_size_x, int bin_size_y, int bin_size_z, int *subprob_to_bin, - int *subprobstartpts, int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, - int nbinz, int *idxnupts) { + T *x, T *y, T *z, cuda_complex *c, cuda_complex *fw, int M, int nf1, int nf2, + int nf3, T sigma, T es_c, T es_beta, int *binstartpts, int *bin_size, int bin_size_x, + int bin_size_y, int bin_size_z, int *subprob_to_bin, int *subprobstartpts, + int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz, int *idxnupts) { extern __shared__ char sharedbuf[]; auto fwshared = (cuda_complex *)sharedbuf; @@ -167,16 +158,10 @@ __global__ void spread_3d_subprob( fwshared[i] = {0, 0}; } __syncthreads(); -#if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); - auto *__restrict__ ker1 = ker; - auto *__restrict__ ker2 = ker + ns; - auto *__restrict__ ker3 = ker + ns + ns; -#else - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; -#endif + + T ker1[ns]; + T ker2[ns]; + T ker3[ns]; for (int i = threadIdx.x; i < nupts; i += blockDim.x) { const int nuptsidx = idxnupts[ptstart + i]; @@ -201,13 +186,13 @@ __global__ void spread_3d_subprob( zend -= zoffset; if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - eval_kernel_vec_horner(ker3, z1, ns, sigma); + eval_kernel_vec_horner(ker1, x1, sigma); + eval_kernel_vec_horner(ker2, y1, sigma); + eval_kernel_vec_horner(ker3, z1, sigma); } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - eval_kernel_vec(ker3, z1, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, es_c, es_beta); + eval_kernel_vec(ker2, y1, es_c, es_beta); + eval_kernel_vec(ker3, z1, es_c, es_beta); } for (int zz = zstart; zz <= zend; zz++) { @@ -310,13 +295,13 @@ __global__ void calc_inverse_of_global_sort_index_ghost( } } -template +template __global__ void spread_3d_block_gather( const T *x, const T *y, const T *z, const cuda_complex *c, cuda_complex *fw, - int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, - const int *binstartpts, int obin_size_x, int obin_size_y, int obin_size_z, - int binsperobin, int *subprob_to_bin, const int *subprobstartpts, int maxsubprobsize, - int nobinx, int nobiny, int nobinz, const int *idxnupts) { + int M, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, const int *binstartpts, + int obin_size_x, int obin_size_y, int obin_size_z, int binsperobin, + int *subprob_to_bin, const int *subprobstartpts, int maxsubprobsize, int nobinx, + int nobiny, int nobinz, const int *idxnupts) { extern __shared__ char sharedbuf[]; cuda_complex *fwshared = (cuda_complex *)sharedbuf; const int subpidx = blockIdx.x; @@ -335,16 +320,10 @@ __global__ void spread_3d_block_gather( const int N = obin_size_x * obin_size_y * obin_size_z; -#if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); - auto *__restrict__ ker1 = ker; - auto *__restrict__ ker2 = ker + ns; - auto *__restrict__ ker3 = ker + ns + ns; -#else - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; -#endif + T ker1[ns]; + T ker2[ns]; + T ker3[ns]; + for (int i = threadIdx.x; i < N; i += blockDim.x) { fwshared[i] = {0, 0}; } @@ -383,13 +362,13 @@ __global__ void spread_3d_block_gather( zend -= zoffset; if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - eval_kernel_vec_horner(ker3, z1, ns, sigma); + eval_kernel_vec_horner(ker1, x1, sigma); + eval_kernel_vec_horner(ker2, y1, sigma); + eval_kernel_vec_horner(ker3, z1, sigma); } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - eval_kernel_vec(ker3, z1, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, es_c, es_beta); + eval_kernel_vec(ker2, y1, es_c, es_beta); + eval_kernel_vec(ker3, z1, es_c, es_beta); } const auto xstartnew = xstart < 0 ? 0 : xstart; @@ -431,20 +410,15 @@ __global__ void spread_3d_block_gather( /* ---------------------- 3d Interpolation Kernels ---------------------------*/ /* Kernels for NUptsdriven Method */ -template +template __global__ void interp_3d_nupts_driven( const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, - int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, int *idxnupts) { -#if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); - auto *__restrict__ ker1 = ker; - auto *__restrict__ ker2 = ker + ns; - auto *__restrict__ ker3 = ker + ns + ns; -#else - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; -#endif + int M, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, int *idxnupts) { + + T ker1[ns]; + T ker2[ns]; + T ker3[ns]; + cuda_complex cnow{}; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { @@ -466,13 +440,13 @@ __global__ void interp_3d_nupts_driven( cnow.y = T(0); if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - eval_kernel_vec_horner(ker3, z1, ns, sigma); + eval_kernel_vec_horner(ker1, x1, sigma); + eval_kernel_vec_horner(ker2, y1, sigma); + eval_kernel_vec_horner(ker3, z1, sigma); } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - eval_kernel_vec(ker3, z1, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, es_c, es_beta); + eval_kernel_vec(ker2, y1, es_c, es_beta); + eval_kernel_vec(ker3, z1, es_c, es_beta); } for (int zz = zstart; zz <= zend; zz++) { @@ -495,27 +469,19 @@ __global__ void interp_3d_nupts_driven( } /* Kernels for SubProb Method */ -template +template __global__ void interp_3d_subprob( const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, - int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, - const int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, - int bin_size_z, const int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz, - const int *idxnupts) { + int M, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, const int *binstartpts, + const int *bin_size, int bin_size_x, int bin_size_y, int bin_size_z, + const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob, + int maxsubprobsize, int nbinx, int nbiny, int nbinz, const int *idxnupts) { extern __shared__ char sharedbuf[]; auto fwshared = (cuda_complex *)sharedbuf; -#if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); - auto *__restrict__ ker1 = ker; - auto *__restrict__ ker2 = ker + ns; - auto *__restrict__ ker3 = ker + ns + ns; -#else - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; -#endif + T ker1[ns]; + T ker2[ns]; + T ker3[ns]; const auto subpidx = blockIdx.x; const auto bidx = subprob_to_bin[subpidx]; @@ -577,13 +543,14 @@ __global__ void interp_3d_subprob( zend -= zoffset; if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - eval_kernel_vec_horner(ker3, z1, ns, sigma); + eval_kernel_vec_horner(ker1, x1, sigma); + + eval_kernel_vec_horner(ker2, y1, sigma); + eval_kernel_vec_horner(ker3, z1, sigma); } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - eval_kernel_vec(ker3, z1, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, es_c, es_beta); + eval_kernel_vec(ker2, y1, es_c, es_beta); + eval_kernel_vec(ker3, z1, es_c, es_beta); } for (int zz = zstart; zz <= zend; zz++) {