#if defined(_WIN32) || defined(__i386__)
#define BT_USE_SSE_IN_API
#endif

#if defined BT_USE_SIMD_VECTOR3

typedef float float4 __attribute__((vector_size(16)));

#if defined BT_USE_SSE || defined _WIN32

#define LOG2_ARRAY_SIZE 6
#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)

#include <emmintrin.h>
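// The routines below find the vertex with the largest (or smallest) dot
// product against a direction vector. Each inner-loop tile transposes four
// xyzw vertices into x, y and z lane vectors and finishes four dot products
// at once. One complete tile with explicit intrinsics (an illustrative
// sketch, not part of this file; the real loops spell the arithmetic out
// with float4 operators):
static inline __m128 sketch_dot4(const __m128 *verts, __m128 vLo /* xyxy */, __m128 vHi /* zzzz */)
{
        __m128 v0 = verts[0], v1 = verts[1], v2 = verts[2], v3 = verts[3];
        __m128 lo0 = _mm_movelh_ps(v0, v1);                          // x0 y0 x1 y1
        __m128 hi0 = _mm_movehl_ps(v1, v0);                          // z0 w0 z1 w1
        __m128 lo1 = _mm_movelh_ps(v2, v3);                          // x2 y2 x3 y3
        __m128 hi1 = _mm_movehl_ps(v3, v2);                          // z2 w2 z3 w3
        lo0 = _mm_mul_ps(lo0, vLo);                                  // x*vx, y*vy pairs
        lo1 = _mm_mul_ps(lo1, vLo);
        __m128 z = _mm_mul_ps(_mm_shuffle_ps(hi0, hi1, 0x88), vHi);  // (z0 z1 z2 z3) * vz
        __m128 x = _mm_shuffle_ps(lo0, lo1, 0x88);                   // x products
        __m128 y = _mm_shuffle_ps(lo0, lo1, 0xdd);                   // y products
        return _mm_add_ps(_mm_add_ps(x, y), z);                      // four dot products
}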
long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);

long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
        const float4 *vertices = (const float4 *)vv;

        // indexTable[mask] = position of the lowest set bit in a 4-bit mask
        static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};

        float4 dotMax = btAssign128(-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY);
        float4 vvec = _mm_loadu_ps(vec);
        float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));  // zzzz
        float4 vLo = _mm_movelh_ps(vvec, vvec);                                    // xyxy

        long maxIndex = -1L;

        size_t segment = 0;
        float4 stack_array[STACK_ARRAY_COUNT];
 
        size_t index;
        float4 max;
        // faster loop without cleanup code for full tiles
        for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
        {
                max = dotMax;

                for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
                {  // do four dot products at a time, carefully avoiding the w element
                        float4 v0 = vertices[0];
                        float4 v1 = vertices[1];
                        float4 v2 = vertices[2];
                        float4 v3 = vertices[3];
                        vertices += 4;

                        float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
                        float4 hi0 = _mm_movehl_ps(v1, v0);  // z0w0z1w1
                        float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
                        float4 hi1 = _mm_movehl_ps(v3, v2);  // z2w2z3w3

                        lo0 = lo0 * vLo;
                        lo1 = lo1 * vLo;
                        float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                        float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                        float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                        z = z * vHi;
                        x = x + y;
                        x = x + z;
                        stack_array[index] = x;
                        max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is NaN

                        v0 = vertices[0];
                        v1 = vertices[1];
                        v2 = vertices[2];
                        v3 = vertices[3];
                        vertices += 4;

                        lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
                        hi0 = _mm_movehl_ps(v1, v0);  // z0w0z1w1
                        lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
                        hi1 = _mm_movehl_ps(v3, v2);  // z2w2z3w3

                        lo0 = lo0 * vLo;
                        lo1 = lo1 * vLo;
                        z = _mm_shuffle_ps(hi0, hi1, 0x88);
                        x = _mm_shuffle_ps(lo0, lo1, 0x88);
                        y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                        z = z * vHi;
                        x = x + y;
                        x = x + z;
                        stack_array[index + 1] = x;
                        max = _mm_max_ps(x, max);

                        v0 = vertices[0];
                        v1 = vertices[1];
                        v2 = vertices[2];
                        v3 = vertices[3];
                        vertices += 4;

                        lo0 = _mm_movelh_ps(v0, v1);
                        hi0 = _mm_movehl_ps(v1, v0);
                        lo1 = _mm_movelh_ps(v2, v3);
                        hi1 = _mm_movehl_ps(v3, v2);

                        lo0 = lo0 * vLo;
                        lo1 = lo1 * vLo;
                        z = _mm_shuffle_ps(hi0, hi1, 0x88);
                        x = _mm_shuffle_ps(lo0, lo1, 0x88);
                        y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                        z = z * vHi;
                        x = x + y;
                        x = x + z;
                        stack_array[index + 2] = x;
                        max = _mm_max_ps(x, max);

                        v0 = vertices[0];
                        v1 = vertices[1];
                        v2 = vertices[2];
                        v3 = vertices[3];
                        vertices += 4;

                        lo0 = _mm_movelh_ps(v0, v1);
                        hi0 = _mm_movehl_ps(v1, v0);
                        lo1 = _mm_movelh_ps(v2, v3);
                        hi1 = _mm_movehl_ps(v3, v2);

                        lo0 = lo0 * vLo;
                        lo1 = lo1 * vLo;
                        z = _mm_shuffle_ps(hi0, hi1, 0x88);
                        x = _mm_shuffle_ps(lo0, lo1, 0x88);
                        y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                        z = z * vHi;
                        x = x + y;
                        x = x + z;
                        stack_array[index + 3] = x;
                        max = _mm_max_ps(x, max);
                }
 
                // if we found a new max
                if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
                {
                        // copy the new max across all lanes of our max accumulator
                        max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
                        max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));

                        dotMax = max;

                        // find first occurrence of that max
                        size_t test;
                        for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)  // local_count must be a multiple of 4
                        {
                        }
                        // record where it is
                        maxIndex = 4 * index + segment + indexTable[test];
                }
        }
 
        // account for the work we've already done
        count -= segment;

        // deal with the last < STACK_ARRAY_COUNT * 4 vertices
        max = dotMax;
        index = 0;

        for (; index + 4 <= count / 4; index += 4)
        {
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3];
                vertices += 4;

                float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
                float4 hi0 = _mm_movehl_ps(v1, v0);  // z0w0z1w1
                float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
                float4 hi1 = _mm_movehl_ps(v3, v2);  // z2w2z3w3

                lo0 = lo0 * vLo;
                lo1 = lo1 * vLo;
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z * vHi;
                x = x + y;
                x = x + z;
                stack_array[index] = x;
                max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is NaN

                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];
                v3 = vertices[3];
                vertices += 4;

                lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
                hi0 = _mm_movehl_ps(v1, v0);  // z0w0z1w1
                lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
                hi1 = _mm_movehl_ps(v3, v2);  // z2w2z3w3

                lo0 = lo0 * vLo;
                lo1 = lo1 * vLo;
                z = _mm_shuffle_ps(hi0, hi1, 0x88);
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z * vHi;
                x = x + y;
                x = x + z;
                stack_array[index + 1] = x;
                max = _mm_max_ps(x, max);

                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];
                v3 = vertices[3];
                vertices += 4;

                lo0 = _mm_movelh_ps(v0, v1);
                hi0 = _mm_movehl_ps(v1, v0);
                lo1 = _mm_movelh_ps(v2, v3);
                hi1 = _mm_movehl_ps(v3, v2);

                lo0 = lo0 * vLo;
                lo1 = lo1 * vLo;
                z = _mm_shuffle_ps(hi0, hi1, 0x88);
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z * vHi;
                x = x + y;
                x = x + z;
                stack_array[index + 2] = x;
                max = _mm_max_ps(x, max);

                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];
                v3 = vertices[3];
                vertices += 4;

                lo0 = _mm_movelh_ps(v0, v1);
                hi0 = _mm_movehl_ps(v1, v0);
                lo1 = _mm_movelh_ps(v2, v3);
                hi1 = _mm_movehl_ps(v3, v2);

                lo0 = lo0 * vLo;
                lo1 = lo1 * vLo;
                z = _mm_shuffle_ps(hi0, hi1, 0x88);
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z * vHi;
                x = x + y;
                x = x + z;
                stack_array[index + 3] = x;
                max = _mm_max_ps(x, max);
        }
 
        size_t localCount = (count & -4L) - 4 * index;
        if (localCount)
        {
#ifdef __APPLE__
                float4 t0, t1, t2, t3, t4;
                float4 *sap = &stack_array[index + localCount / 4];
                vertices += localCount;  // step past the block; the asm walks it back via the negative byteIndex
                size_t byteIndex = -(localCount) * sizeof(float);
 
                asm volatile(
                        ".align 4                                                                   \n\
     0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
        movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
        movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
        movaps  %[t0], %[max]                               // vertices[0]      \n\
        movlhps %[t1], %[max]                               // x0y0x1y1         \n\
        movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
        movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
        mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo   \n\
        movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
        movaps  %[t3], %[t0]                                // vertices[2]      \n\
        movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
        mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
        movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
        shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
        mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
        movaps  %[max], %[t3]                               // x0y0x1y1 * vLo   \n\
        shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
        shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
        addps   %[t3], %[max]                               // x + y            \n\
        addps   %[t1], %[max]                               // x + y + z        \n\
        movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
        maxps   %[t2], %[max]                               // record max, restore max   \n\
        add     $16, %[byteIndex]                           // advance loop counter \n\
        jnz     0b                                          \n\
        "
                        : [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
                        : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
                        : "memory", "cc");
 
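                // The counter here is the trick: byteIndex starts at
                // -(localCount) * sizeof(float) and counts up by 16 each pass, so a
                // single add both advances the addressing and sets the zero flag
                // that ends the loop (jnz 0b). The same offset serves two strides:
                // scaled by 4 it walks vertices (64 bytes, four float4s per pass),
                // unscaled it walks sap (16 bytes, one result vector per pass).
                // In C the idiom looks like (an illustrative sketch only):
                //
                //     for (long ofs = -(long)n; ofs != 0; ofs += 4)
                //         process(end + ofs);  // visits end[-n], end[-n+4], ..., end[-4]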
                index += localCount / 4;
#else
                {
                        for (unsigned int i = 0; i < localCount / 4; i++, index++)
                        {
                                float4 v0 = vertices[0];
                                float4 v1 = vertices[1];
                                float4 v2 = vertices[2];
                                float4 v3 = vertices[3];
                                vertices += 4;

                                float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
                                float4 hi0 = _mm_movehl_ps(v1, v0);  // z0w0z1w1
                                float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
                                float4 hi1 = _mm_movehl_ps(v3, v2);  // z2w2z3w3

                                lo0 = lo0 * vLo;
                                lo1 = lo1 * vLo;
                                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                                z = z * vHi;
                                x = x + y;
                                x = x + z;
                                stack_array[index] = x;
                                max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is NaN
                        }
                }
#endif  // __APPLE__
        }
 
        // process the last few vertices
        if (count & 3)
        {
                float4 v0, v1, v2, x, y, z;
                switch (count & 3)
                {
                        case 3:
                        {
                                v0 = vertices[0];
                                v1 = vertices[1];
                                v2 = vertices[2];

                                // calculate three dot products, transpose, duplicate v2
                                float4 lo0 = _mm_movelh_ps(v0, v1);  // xyxy.lo
                                float4 hi0 = _mm_movehl_ps(v1, v0);  // z?z?.lo
                                lo0 = lo0 * vLo;
                                z = _mm_shuffle_ps(hi0, v2, 0xa8);  // z0z1z2z2
                                z = z * vHi;
                                float4 lo1 = _mm_movelh_ps(v2, v2);  // xyxy
                                lo1 = lo1 * vLo;
                                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                        }
                        break;
                        case 2:
                        {
                                v0 = vertices[0];
                                v1 = vertices[1];
                                float4 xy = _mm_movelh_ps(v0, v1);
                                z = _mm_movehl_ps(v1, v0);
                                xy = xy * vLo;
                                z = _mm_shuffle_ps(z, z, 0xa8);
                                x = _mm_shuffle_ps(xy, xy, 0xa8);
                                y = _mm_shuffle_ps(xy, xy, 0xfd);
                                z = z * vHi;
                        }
                        break;
                        case 1:
                        {
                                float4 xy = vertices[0];
                                z = _mm_shuffle_ps(xy, xy, 0xaa);
                                xy = xy * vLo;
                                z = z * vHi;
                                x = _mm_shuffle_ps(xy, xy, 0);
                                y = _mm_shuffle_ps(xy, xy, 0x55);
                        }
                        break;
                }
                x = x + y;
                x = x + z;
                stack_array[index] = x;
                max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is NaN
        }
 
        // if we found a new max
        if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
        {
                // copy the new max across all lanes of our max accumulator
                max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
                max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));

                dotMax = max;

                // scan for the first occurrence of max in the saved results
                size_t test;
                for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)  // local_count must be a multiple of 4
                {
                }
                maxIndex = 4 * index + segment + indexTable[test];
        }

        _mm_store_ss(dotResult, dotMax);
        return maxIndex;
}
 
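// How the indexTable trick above recovers a lane number (an illustrative
// sketch, not part of this file): _mm_cmpeq_ps sets a lane to all-ones where
// the dot product equals the broadcast max, _mm_movemask_ps compresses the
// four sign bits into a 4-bit mask, and the table maps that mask to its
// lowest set bit.
static inline int sketch_firstMatchingLane(__m128 dots, __m128 broadcastMax)
{
        static const unsigned char lowestSetBit[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0,
                                                       3, 0, 1, 0, 2, 0, 1, 0};  // same contents as indexTable
        int mask = _mm_movemask_ps(_mm_cmpeq_ps(dots, broadcastMax));  // bit i set if lane i matches
        return lowestSetBit[mask];  // lane of the first match; callers guarantee mask != 0
}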
long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);

long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
        const float4 *vertices = (const float4 *)vv;

        // indexTable[mask] = position of the lowest set bit in a 4-bit mask
        static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};

        float4 dotmin = btAssign128(BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY);
        float4 vvec = _mm_loadu_ps(vec);
        float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));  // zzzz
        float4 vLo = _mm_movelh_ps(vvec, vvec);                                    // xyxy

        long minIndex = -1L;

        size_t segment = 0;
        float4 stack_array[STACK_ARRAY_COUNT];
 
        size_t index;
        float4 min;
        // faster loop without cleanup code for full tiles
        for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
        {
                min = dotmin;

                for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
                {  // do four dot products at a time, carefully avoiding the w element
                        float4 v0 = vertices[0];
                        float4 v1 = vertices[1];
                        float4 v2 = vertices[2];
                        float4 v3 = vertices[3];
                        vertices += 4;

                        float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
                        float4 hi0 = _mm_movehl_ps(v1, v0);  // z0w0z1w1
                        float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
                        float4 hi1 = _mm_movehl_ps(v3, v2);  // z2w2z3w3

                        lo0 = lo0 * vLo;
                        lo1 = lo1 * vLo;
                        float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                        float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                        float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                        z = z * vHi;
                        x = x + y;
                        x = x + z;
                        stack_array[index] = x;
                        min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is NaN

                        v0 = vertices[0];
                        v1 = vertices[1];
                        v2 = vertices[2];
                        v3 = vertices[3];
                        vertices += 4;

                        lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
                        hi0 = _mm_movehl_ps(v1, v0);  // z0w0z1w1
                        lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
                        hi1 = _mm_movehl_ps(v3, v2);  // z2w2z3w3

                        lo0 = lo0 * vLo;
                        lo1 = lo1 * vLo;
                        z = _mm_shuffle_ps(hi0, hi1, 0x88);
                        x = _mm_shuffle_ps(lo0, lo1, 0x88);
                        y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                        z = z * vHi;
                        x = x + y;
                        x = x + z;
                        stack_array[index + 1] = x;
                        min = _mm_min_ps(x, min);

                        v0 = vertices[0];
                        v1 = vertices[1];
                        v2 = vertices[2];
                        v3 = vertices[3];
                        vertices += 4;

                        lo0 = _mm_movelh_ps(v0, v1);
                        hi0 = _mm_movehl_ps(v1, v0);
                        lo1 = _mm_movelh_ps(v2, v3);
                        hi1 = _mm_movehl_ps(v3, v2);

                        lo0 = lo0 * vLo;
                        lo1 = lo1 * vLo;
                        z = _mm_shuffle_ps(hi0, hi1, 0x88);
                        x = _mm_shuffle_ps(lo0, lo1, 0x88);
                        y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                        z = z * vHi;
                        x = x + y;
                        x = x + z;
                        stack_array[index + 2] = x;
                        min = _mm_min_ps(x, min);

                        v0 = vertices[0];
                        v1 = vertices[1];
                        v2 = vertices[2];
                        v3 = vertices[3];
                        vertices += 4;

                        lo0 = _mm_movelh_ps(v0, v1);
                        hi0 = _mm_movehl_ps(v1, v0);
                        lo1 = _mm_movelh_ps(v2, v3);
                        hi1 = _mm_movehl_ps(v3, v2);

                        lo0 = lo0 * vLo;
                        lo1 = lo1 * vLo;
                        z = _mm_shuffle_ps(hi0, hi1, 0x88);
                        x = _mm_shuffle_ps(lo0, lo1, 0x88);
                        y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                        z = z * vHi;
                        x = x + y;
                        x = x + z;
                        stack_array[index + 3] = x;
                        min = _mm_min_ps(x, min);
                }
 
                // if we found a new min
                if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
                {
                        // copy the new min across all lanes of our min accumulator
                        min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
                        min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));

                        dotmin = min;

                        // find first occurrence of that min
                        size_t test;
                        for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)  // local_count must be a multiple of 4
                        {
                        }
                        // record where it is
                        minIndex = 4 * index + segment + indexTable[test];
                }
        }
 
        // account for the work we've already done
        count -= segment;

        // deal with the last < STACK_ARRAY_COUNT * 4 vertices
        min = dotmin;
        index = 0;

        for (; index + 4 <= count / 4; index += 4)
        {
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3];
                vertices += 4;

                float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
                float4 hi0 = _mm_movehl_ps(v1, v0);  // z0w0z1w1
                float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
                float4 hi1 = _mm_movehl_ps(v3, v2);  // z2w2z3w3

                lo0 = lo0 * vLo;
                lo1 = lo1 * vLo;
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z * vHi;
                x = x + y;
                x = x + z;
                stack_array[index] = x;
                min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is NaN

                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];
                v3 = vertices[3];
                vertices += 4;

                lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
                hi0 = _mm_movehl_ps(v1, v0);  // z0w0z1w1
                lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
                hi1 = _mm_movehl_ps(v3, v2);  // z2w2z3w3

                lo0 = lo0 * vLo;
                lo1 = lo1 * vLo;
                z = _mm_shuffle_ps(hi0, hi1, 0x88);
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z * vHi;
                x = x + y;
                x = x + z;
                stack_array[index + 1] = x;
                min = _mm_min_ps(x, min);

                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];
                v3 = vertices[3];
                vertices += 4;

                lo0 = _mm_movelh_ps(v0, v1);
                hi0 = _mm_movehl_ps(v1, v0);
                lo1 = _mm_movelh_ps(v2, v3);
                hi1 = _mm_movehl_ps(v3, v2);

                lo0 = lo0 * vLo;
                lo1 = lo1 * vLo;
                z = _mm_shuffle_ps(hi0, hi1, 0x88);
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z * vHi;
                x = x + y;
                x = x + z;
                stack_array[index + 2] = x;
                min = _mm_min_ps(x, min);

                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];
                v3 = vertices[3];
                vertices += 4;

                lo0 = _mm_movelh_ps(v0, v1);
                hi0 = _mm_movehl_ps(v1, v0);
                lo1 = _mm_movelh_ps(v2, v3);
                hi1 = _mm_movehl_ps(v3, v2);

                lo0 = lo0 * vLo;
                lo1 = lo1 * vLo;
                z = _mm_shuffle_ps(hi0, hi1, 0x88);
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = z * vHi;
                x = x + y;
                x = x + z;
                stack_array[index + 3] = x;
                min = _mm_min_ps(x, min);
        }
 
        size_t localCount = (count & -4L) - 4 * index;
        if (localCount)
        {
#ifdef __APPLE__
                vertices += localCount;  // step past the block; the asm walks it back via the negative byteIndex
                float4 t0, t1, t2, t3, t4;
                size_t byteIndex = -(localCount) * sizeof(float);
                float4 *sap = &stack_array[index + localCount / 4];
 
                asm volatile(
                        ".align 4                                                                   \n\
     0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
        movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
        movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
        movaps  %[t0], %[min]                               // vertices[0]      \n\
        movlhps %[t1], %[min]                               // x0y0x1y1         \n\
        movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
        movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
        mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo   \n\
        movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
        movaps  %[t3], %[t0]                                // vertices[2]      \n\
        movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
        movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
        mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
        shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
        mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
        movaps  %[min], %[t3]                               // x0y0x1y1 * vLo   \n\
        shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
        shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
        addps   %[t3], %[min]                               // x + y            \n\
        addps   %[t1], %[min]                               // x + y + z        \n\
        movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
        minps   %[t2], %[min]                               // record min, restore min   \n\
        add     $16, %[byteIndex]                           // advance loop counter \n\
        jnz     0b                                          \n\
        "
                        : [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
                        : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
                        : "memory", "cc");
 
                index += localCount / 4;
#else
                {
                        for (unsigned int i = 0; i < localCount / 4; i++, index++)
                        {
                                float4 v0 = vertices[0];
                                float4 v1 = vertices[1];
                                float4 v2 = vertices[2];
                                float4 v3 = vertices[3];
                                vertices += 4;

                                float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
                                float4 hi0 = _mm_movehl_ps(v1, v0);  // z0w0z1w1
                                float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
                                float4 hi1 = _mm_movehl_ps(v3, v2);  // z2w2z3w3

                                lo0 = lo0 * vLo;
                                lo1 = lo1 * vLo;
                                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                                z = z * vHi;
                                x = x + y;
                                x = x + z;
                                stack_array[index] = x;
                                min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is NaN
                        }
                }
#endif  // __APPLE__
        }
 
        // process the last few vertices
        if (count & 3)
        {
                float4 v0, v1, v2, x, y, z;
                switch (count & 3)
                {
                        case 3:
                        {
                                v0 = vertices[0];
                                v1 = vertices[1];
                                v2 = vertices[2];

                                // calculate three dot products, transpose, duplicate v2
                                float4 lo0 = _mm_movelh_ps(v0, v1);  // xyxy.lo
                                float4 hi0 = _mm_movehl_ps(v1, v0);  // z?z?.lo
                                lo0 = lo0 * vLo;
                                z = _mm_shuffle_ps(hi0, v2, 0xa8);  // z0z1z2z2
                                z = z * vHi;
                                float4 lo1 = _mm_movelh_ps(v2, v2);  // xyxy
                                lo1 = lo1 * vLo;
                                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                        }
                        break;
                        case 2:
                        {
                                v0 = vertices[0];
                                v1 = vertices[1];
                                float4 xy = _mm_movelh_ps(v0, v1);
                                z = _mm_movehl_ps(v1, v0);
                                xy = xy * vLo;
                                z = _mm_shuffle_ps(z, z, 0xa8);
                                x = _mm_shuffle_ps(xy, xy, 0xa8);
                                y = _mm_shuffle_ps(xy, xy, 0xfd);
                                z = z * vHi;
                        }
                        break;
                        case 1:
                        {
                                float4 xy = vertices[0];
                                z = _mm_shuffle_ps(xy, xy, 0xaa);
                                xy = xy * vLo;
                                z = z * vHi;
                                x = _mm_shuffle_ps(xy, xy, 0);
                                y = _mm_shuffle_ps(xy, xy, 0x55);
                        }
                        break;
                }
                x = x + y;
                x = x + z;
                stack_array[index] = x;
                min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is NaN
        }
 
        // if we found a new min
        if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
        {
                // copy the new min across all lanes of our min accumulator
                min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
                min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));

                dotmin = min;

                // scan for the first occurrence of min in the saved results
                size_t test;
                for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)  // local_count must be a multiple of 4
                {
                }
                minIndex = 4 * index + segment + indexTable[test];
        }

        _mm_store_ss(dotResult, dotmin);
        return minIndex;
}
 
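// Reference semantics of the two routines above (an illustrative scalar
// sketch, not part of this file): return the index of the vertex whose dot
// product against vec is largest, writing that dot product through dotResult.
// Vertices are stored four floats apart; the w component is ignored.
// _mindot_large is the same with < and +FLT_MAX.
#include <float.h>
static long sketch_maxdot_scalar(const float *vv, const float *vec,
                                 unsigned long count, float *dotResult)
{
        long best = -1L;
        float bestDot = -FLT_MAX;
        for (unsigned long i = 0; i < count; i++)
        {
                const float *v = vv + 4 * i;  // xyzw stride
                float d = v[0] * vec[0] + v[1] * vec[1] + v[2] * vec[2];
                if (d > bestDot)
                {
                        bestDot = d;  // keeps the first occurrence of the max
                        best = (long)i;
                }
        }
        *dotResult = bestDot;
        return best;
}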
#elif defined BT_USE_NEON

#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>
#include <sys/types.h>
#include <sys/sysctl.h>  // for sysctlbyname
static long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);

long (*_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _maxdot_large_sel;
long (*_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _mindot_large_sel;
 
static inline uint32_t btGetCpuCapabilities(void)
{
        static uint32_t capabilities = 0;
        static bool testedCapabilities = false;

        if (0 == testedCapabilities)
        {
                uint32_t hasFeature = 0;
                size_t featureSize = sizeof(hasFeature);
                int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0);

                if (0 == err && hasFeature)
                        capabilities |= 0x2000;

                testedCapabilities = true;
        }

        return capabilities;
}
 
static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
        if (btGetCpuCapabilities() & 0x2000)
                _maxdot_large = _maxdot_large_v1;
        else
                _maxdot_large = _maxdot_large_v0;

        return _maxdot_large(vv, vec, count, dotResult);
}
 
static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
        if (btGetCpuCapabilities() & 0x2000)
                _mindot_large = _mindot_large_v1;
        else
                _mindot_large = _mindot_large_v0;

        return _mindot_large(vv, vec, count, dotResult);
}
 
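// The function pointers above implement one-time CPU dispatch: _maxdot_large
// starts out pointing at the _sel trampoline, which probes the CPU once,
// repoints the pointer at the right implementation, and forwards the call, so
// every later call goes straight to the selected version. The pattern in
// miniature (an illustrative sketch with hypothetical names, not part of this file):
static int sketch_cpu_has_feature(void) { return 1; }  // stand-in for a real CPU probe

static long sketch_impl_v1(long x) { return x * 2; }   // preferred version
static long sketch_impl_v0(long x) { return x + x; }   // fallback version

static long sketch_sel(long x);                        // trampoline, defined below
static long (*sketch_op)(long) = sketch_sel;           // starts at the trampoline

static long sketch_sel(long x)
{
        sketch_op = sketch_cpu_has_feature() ? sketch_impl_v1 : sketch_impl_v0;  // repoint once
        return sketch_op(x);                                                     // forward the first call
}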
#if defined __arm__
#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) );  _r; })
#else
// 64-bit ARM has no aligned-load-with-postincrement instruction form
#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L);  _r; })
#endif
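// Both definitions above are GNU statement expressions: they load one
// 16-byte-aligned float32x4_t and advance the pointer that was passed in,
// mimicking NEON's post-increment addressing. Usage sketch (illustrative
// only, not part of this file):
static inline float32x4_t sketch_load_two(const float *p, float32x4_t *second)
{
        float32x4_t a = vld1q_f32_aligned_postincrement(p);  // p[0..3]; p advances 4 floats
        *second = vld1q_f32_aligned_postincrement(p);        // p[4..7]
        return a;
}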
long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
        unsigned long i = 0;
        float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
        float32x2_t vLo = vget_low_f32(vvec);
        float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);

        float32x2_t dotMaxLo = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
        float32x2_t dotMaxHi = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
        uint32x2_t indexLo = (uint32x2_t){0, 1};
        uint32x2_t indexHi = (uint32x2_t){2, 3};
        uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
        uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
        const uint32x2_t four = (uint32x2_t){4, 4};
 
        for (; i + 8 <= count; i += 8)
        {
                float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

                // x*vx and y*vy terms, one vertex per 2-lane vector
                float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
                float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
                float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
                float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

                // gather z components and scale by vz
                float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
                float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
                float32x2_t zLo = vmul_f32(z0.val[0], vHi);
                float32x2_t zHi = vmul_f32(z1.val[0], vHi);

                // pairwise add gives x*vx + y*vy, then add the z term
                float32x2_t rLo = vpadd_f32(xy0, xy1);
                float32x2_t rHi = vpadd_f32(xy2, xy3);
                rLo = vadd_f32(rLo, zLo);
                rHi = vadd_f32(rHi, zHi);

                // keep the larger dot products and their indices, per lane
                uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
                uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
                dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
                dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
                iLo = vbsl_u32(maskLo, indexLo, iLo);
                iHi = vbsl_u32(maskHi, indexHi, iHi);
                indexLo = vadd_u32(indexLo, four);
                indexHi = vadd_u32(indexHi, four);

                // second group of four vertices
                v0 = vld1q_f32_aligned_postincrement(vv);
                v1 = vld1q_f32_aligned_postincrement(vv);
                v2 = vld1q_f32_aligned_postincrement(vv);
                v3 = vld1q_f32_aligned_postincrement(vv);

                xy0 = vmul_f32(vget_low_f32(v0), vLo);
                xy1 = vmul_f32(vget_low_f32(v1), vLo);
                xy2 = vmul_f32(vget_low_f32(v2), vLo);
                xy3 = vmul_f32(vget_low_f32(v3), vLo);

                z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
                z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
                zLo = vmul_f32(z0.val[0], vHi);
                zHi = vmul_f32(z1.val[0], vHi);

                rLo = vpadd_f32(xy0, xy1);
                rHi = vpadd_f32(xy2, xy3);
                rLo = vadd_f32(rLo, zLo);
                rHi = vadd_f32(rHi, zHi);

                maskLo = vcgt_f32(rLo, dotMaxLo);
                maskHi = vcgt_f32(rHi, dotMaxHi);
                dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
                dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
                iLo = vbsl_u32(maskLo, indexLo, iLo);
                iHi = vbsl_u32(maskHi, indexHi, iHi);
                indexLo = vadd_u32(indexLo, four);
                indexHi = vadd_u32(indexHi, four);
        }
 
        for (; i + 4 <= count; i += 4)
        {
                float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

                float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
                float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
                float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
                float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

                float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
                float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
                float32x2_t zLo = vmul_f32(z0.val[0], vHi);
                float32x2_t zHi = vmul_f32(z1.val[0], vHi);

                float32x2_t rLo = vpadd_f32(xy0, xy1);
                float32x2_t rHi = vpadd_f32(xy2, xy3);
                rLo = vadd_f32(rLo, zLo);
                rHi = vadd_f32(rHi, zHi);

                uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
                uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
                dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
                dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
                iLo = vbsl_u32(maskLo, indexLo, iLo);
                iHi = vbsl_u32(maskHi, indexHi, iHi);
                indexLo = vadd_u32(indexLo, four);
                indexHi = vadd_u32(indexHi, four);
        }
 
        switch (count & 3)
        {
                case 3:
                {
                        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
                        float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

                        float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
                        float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
                        float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);

                        float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
                        float32x2_t zLo = vmul_f32(z0.val[0], vHi);
                        float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);

                        float32x2_t rLo = vpadd_f32(xy0, xy1);
                        float32x2_t rHi = vpadd_f32(xy2, xy2);
                        rLo = vadd_f32(rLo, zLo);
                        rHi = vadd_f32(rHi, zHi);

                        uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
                        uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
                        dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
                        dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
                        iLo = vbsl_u32(maskLo, indexLo, iLo);
                        iHi = vbsl_u32(maskHi, indexHi, iHi);
                }
                break;
                case 2:
                {
                        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

                        float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
                        float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);

                        float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
                        float32x2_t zLo = vmul_f32(z0.val[0], vHi);

                        float32x2_t rLo = vpadd_f32(xy0, xy1);
                        rLo = vadd_f32(rLo, zLo);

                        uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
                        dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
                        iLo = vbsl_u32(maskLo, indexLo, iLo);
                }
                break;
                case 1:
                {
                        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                        float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
                        float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
                        float32x2_t zLo = vmul_f32(z0, vHi);
                        float32x2_t rLo = vpadd_f32(xy0, xy0);
                        rLo = vadd_f32(rLo, zLo);
                        uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
                        dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
                        iLo = vbsl_u32(maskLo, indexLo, iLo);
                }
                break;
        }
 
        // select the best answer between hi and lo results
        uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
        dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
        iLo = vbsl_u32(mask, iHi, iLo);

        // select the best answer between even and odd lanes
        dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
        iHi = vdup_lane_u32(iLo, 1);
        mask = vcgt_f32(dotMaxHi, dotMaxLo);
        dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
        iLo = vbsl_u32(mask, iHi, iLo);

        *dotResult = vget_lane_f32(dotMaxLo, 0);
        return vget_lane_u32(iLo, 0);
}
 
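// Every comparison step in _maxdot_large_v0 above is the same three-operation
// select: compare, keep the larger dot per lane, keep the matching index per
// lane. Isolated as a helper (an illustrative sketch, not part of this file):
static inline void sketch_update_max(float32x2_t r, uint32x2_t candidateIndex,
                                     float32x2_t *bestDot, uint32x2_t *bestIndex)
{
        uint32x2_t mask = vcgt_f32(r, *bestDot);                  // lanes where r is a new max
        *bestDot = vbsl_f32(mask, r, *bestDot);                   // select the new dot per lane
        *bestIndex = vbsl_u32(mask, candidateIndex, *bestIndex);  // keep the index in sync
}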
long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
        float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
        float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
        float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
        const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
        uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
        uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
        float32x4_t maxDot = (float32x4_t){-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY};

        unsigned long i = 0;
 
        for (; i + 8 <= count; i += 8)
        {
                float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

                // pack the xy halves and zw halves of four vertices
                float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
                float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
                float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
                float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

                xy0 = vmulq_f32(xy0, vLo);
                xy1 = vmulq_f32(xy1, vLo);

                float32x4x2_t zb = vuzpq_f32(z0, z1);    // zb.val[0] = z0 z1 z2 z3
                float32x4_t z = vmulq_f32(zb.val[0], vHi);
                float32x4x2_t xy = vuzpq_f32(xy0, xy1);  // split x terms from y terms
                float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
                x = vaddq_f32(x, z);                     // four dot products

                uint32x4_t mask = vcgtq_f32(x, maxDot);
                maxDot = vbslq_f32(mask, x, maxDot);
                index = vbslq_u32(mask, local_index, index);
                local_index = vaddq_u32(local_index, four);

                // second group of four vertices
                v0 = vld1q_f32_aligned_postincrement(vv);
                v1 = vld1q_f32_aligned_postincrement(vv);
                v2 = vld1q_f32_aligned_postincrement(vv);
                v3 = vld1q_f32_aligned_postincrement(vv);

                xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
                xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
                z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
                z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

                xy0 = vmulq_f32(xy0, vLo);
                xy1 = vmulq_f32(xy1, vLo);

                zb = vuzpq_f32(z0, z1);
                z = vmulq_f32(zb.val[0], vHi);
                xy = vuzpq_f32(xy0, xy1);
                x = vaddq_f32(xy.val[0], xy.val[1]);
                x = vaddq_f32(x, z);

                mask = vcgtq_f32(x, maxDot);
                maxDot = vbslq_f32(mask, x, maxDot);
                index = vbslq_u32(mask, local_index, index);
                local_index = vaddq_u32(local_index, four);
        }
 
        for (; i + 4 <= count; i += 4)
        {
                float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

                float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
                float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
                float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
                float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

                xy0 = vmulq_f32(xy0, vLo);
                xy1 = vmulq_f32(xy1, vLo);

                float32x4x2_t zb = vuzpq_f32(z0, z1);
                float32x4_t z = vmulq_f32(zb.val[0], vHi);
                float32x4x2_t xy = vuzpq_f32(xy0, xy1);
                float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
                x = vaddq_f32(x, z);

                uint32x4_t mask = vcgtq_f32(x, maxDot);
                maxDot = vbslq_f32(mask, x, maxDot);
                index = vbslq_u32(mask, local_index, index);
                local_index = vaddq_u32(local_index, four);
        }
 
        switch (count & 3)
        {
                case 3:
                {
                        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
                        float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

                        // duplicate the third vertex to fill the fourth lane
                        float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
                        float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
                        float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
                        float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));

                        xy0 = vmulq_f32(xy0, vLo);
                        xy1 = vmulq_f32(xy1, vLo);

                        float32x4x2_t zb = vuzpq_f32(z0, z1);
                        float32x4_t z = vmulq_f32(zb.val[0], vHi);
                        float32x4x2_t xy = vuzpq_f32(xy0, xy1);
                        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
                        x = vaddq_f32(x, z);

                        uint32x4_t mask = vcgtq_f32(x, maxDot);
                        maxDot = vbslq_f32(mask, x, maxDot);
                        index = vbslq_u32(mask, local_index, index);
                        local_index = vaddq_u32(local_index, four);
                }
                break;
                case 2:
                {
                        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

                        float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
                        float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));

                        xy0 = vmulq_f32(xy0, vLo);

                        float32x4x2_t zb = vuzpq_f32(z0, z0);
                        float32x4_t z = vmulq_f32(zb.val[0], vHi);
                        float32x4x2_t xy = vuzpq_f32(xy0, xy0);
                        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
                        x = vaddq_f32(x, z);

                        uint32x4_t mask = vcgtq_f32(x, maxDot);
                        maxDot = vbslq_f32(mask, x, maxDot);
                        index = vbslq_u32(mask, local_index, index);
                        local_index = vaddq_u32(local_index, four);
                }
                break;
                case 1:
                {
                        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);

                        float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
                        float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

                        xy0 = vmulq_f32(xy0, vLo);
                        z = vmulq_f32(z, vHi);
                        float32x4x2_t xy = vuzpq_f32(xy0, xy0);
                        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
                        x = vaddq_f32(x, z);

                        uint32x4_t mask = vcgtq_f32(x, maxDot);
                        maxDot = vbslq_f32(mask, x, maxDot);
                        index = vbslq_u32(mask, local_index, index);
                        local_index = vaddq_u32(local_index, four);
                }
                break;
        }
 
        // select the best answer between hi and lo results
        uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));
        float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
        uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

        // select the best answer between even and odd lanes
        float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
        uint32x2_t indexHi = vdup_lane_u32(index2, 1);
        mask = vcgt_f32(maxDotO, maxDot2);
        maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
        index2 = vbsl_u32(mask, indexHi, index2);

        *dotResult = vget_lane_f32(maxDot2, 0);
        return vget_lane_u32(index2, 0);
}
 
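// The wind-down in both NEON variants reduces four per-lane maxima (or
// minima) to one answer in two select steps: high pair vs. low pair, then odd
// lane vs. even lane, carrying the index along with each select. The
// equivalent scalar logic (an illustrative sketch, not part of this file):
static long sketch_reduce4(const float bestDot[4], const long bestIndex[4], float *outDot)
{
        float d0 = bestDot[0], d1 = bestDot[1];
        long i0 = bestIndex[0], i1 = bestIndex[1];
        if (bestDot[2] > d0) { d0 = bestDot[2]; i0 = bestIndex[2]; }  // hi vs. lo
        if (bestDot[3] > d1) { d1 = bestDot[3]; i1 = bestIndex[3]; }
        if (d1 > d0) { d0 = d1; i0 = i1; }                            // odd vs. even
        *outDot = d0;
        return i0;
}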
long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
        unsigned long i = 0;
        float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
        float32x2_t vLo = vget_low_f32(vvec);
        float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);

        float32x2_t dotMinLo = (float32x2_t){BT_INFINITY, BT_INFINITY};
        float32x2_t dotMinHi = (float32x2_t){BT_INFINITY, BT_INFINITY};
        uint32x2_t indexLo = (uint32x2_t){0, 1};
        uint32x2_t indexHi = (uint32x2_t){2, 3};
        uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
        uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
        const uint32x2_t four = (uint32x2_t){4, 4};
 
        for (; i + 8 <= count; i += 8)
        {
                float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

                float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
                float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
                float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
                float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

                float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
                float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
                float32x2_t zLo = vmul_f32(z0.val[0], vHi);
                float32x2_t zHi = vmul_f32(z1.val[0], vHi);

                float32x2_t rLo = vpadd_f32(xy0, xy1);
                float32x2_t rHi = vpadd_f32(xy2, xy3);
                rLo = vadd_f32(rLo, zLo);
                rHi = vadd_f32(rHi, zHi);

                // keep the smaller dot products and their indices, per lane
                uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
                uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
                dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
                dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
                iLo = vbsl_u32(maskLo, indexLo, iLo);
                iHi = vbsl_u32(maskHi, indexHi, iHi);
                indexLo = vadd_u32(indexLo, four);
                indexHi = vadd_u32(indexHi, four);

                // second group of four vertices
                v0 = vld1q_f32_aligned_postincrement(vv);
                v1 = vld1q_f32_aligned_postincrement(vv);
                v2 = vld1q_f32_aligned_postincrement(vv);
                v3 = vld1q_f32_aligned_postincrement(vv);

                xy0 = vmul_f32(vget_low_f32(v0), vLo);
                xy1 = vmul_f32(vget_low_f32(v1), vLo);
                xy2 = vmul_f32(vget_low_f32(v2), vLo);
                xy3 = vmul_f32(vget_low_f32(v3), vLo);

                z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
                z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
                zLo = vmul_f32(z0.val[0], vHi);
                zHi = vmul_f32(z1.val[0], vHi);

                rLo = vpadd_f32(xy0, xy1);
                rHi = vpadd_f32(xy2, xy3);
                rLo = vadd_f32(rLo, zLo);
                rHi = vadd_f32(rHi, zHi);

                maskLo = vclt_f32(rLo, dotMinLo);
                maskHi = vclt_f32(rHi, dotMinHi);
                dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
                dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
                iLo = vbsl_u32(maskLo, indexLo, iLo);
                iHi = vbsl_u32(maskHi, indexHi, iHi);
                indexLo = vadd_u32(indexLo, four);
                indexHi = vadd_u32(indexHi, four);
        }
 
        for (; i + 4 <= count; i += 4)
        {
                float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

                float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
                float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
                float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
                float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

                float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
                float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
                float32x2_t zLo = vmul_f32(z0.val[0], vHi);
                float32x2_t zHi = vmul_f32(z1.val[0], vHi);

                float32x2_t rLo = vpadd_f32(xy0, xy1);
                float32x2_t rHi = vpadd_f32(xy2, xy3);
                rLo = vadd_f32(rLo, zLo);
                rHi = vadd_f32(rHi, zHi);

                uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
                uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
                dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
                dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
                iLo = vbsl_u32(maskLo, indexLo, iLo);
                iHi = vbsl_u32(maskHi, indexHi, iHi);
                indexLo = vadd_u32(indexLo, four);
                indexHi = vadd_u32(indexHi, four);
        }
 
        switch (count & 3)
        {
                case 3:
                {
                        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
                        float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

                        float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
                        float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
                        float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);

                        float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
                        float32x2_t zLo = vmul_f32(z0.val[0], vHi);
                        float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);

                        float32x2_t rLo = vpadd_f32(xy0, xy1);
                        float32x2_t rHi = vpadd_f32(xy2, xy2);
                        rLo = vadd_f32(rLo, zLo);
                        rHi = vadd_f32(rHi, zHi);

                        uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
                        uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
                        dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
                        dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
                        iLo = vbsl_u32(maskLo, indexLo, iLo);
                        iHi = vbsl_u32(maskHi, indexHi, iHi);
                }
                break;
                case 2:
                {
                        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

                        float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
                        float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);

                        float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
                        float32x2_t zLo = vmul_f32(z0.val[0], vHi);

                        float32x2_t rLo = vpadd_f32(xy0, xy1);
                        rLo = vadd_f32(rLo, zLo);

                        uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
                        dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
                        iLo = vbsl_u32(maskLo, indexLo, iLo);
                }
                break;
                case 1:
                {
                        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                        float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
                        float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
                        float32x2_t zLo = vmul_f32(z0, vHi);
                        float32x2_t rLo = vpadd_f32(xy0, xy0);
                        rLo = vadd_f32(rLo, zLo);
                        uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
                        dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
                        iLo = vbsl_u32(maskLo, indexLo, iLo);
                }
                break;
        }
 
        // select the best answer between hi and lo results
        uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
        dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
        iLo = vbsl_u32(mask, iHi, iLo);

        // select the best answer between even and odd lanes
        dotMinHi = vdup_lane_f32(dotMinLo, 1);
        iHi = vdup_lane_u32(iLo, 1);
        mask = vclt_f32(dotMinHi, dotMinLo);
        dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
        iLo = vbsl_u32(mask, iHi, iLo);

        *dotResult = vget_lane_f32(dotMinLo, 0);
        return vget_lane_u32(iLo, 0);
}
 
long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
        float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
        float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
        float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
        const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
        uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
        uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
        float32x4_t minDot = (float32x4_t){BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY};

        unsigned long i = 0;
 
        for (; i + 8 <= count; i += 8)
        {
                float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

                float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
                float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
                float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
                float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

                xy0 = vmulq_f32(xy0, vLo);
                xy1 = vmulq_f32(xy1, vLo);

                float32x4x2_t zb = vuzpq_f32(z0, z1);
                float32x4_t z = vmulq_f32(zb.val[0], vHi);
                float32x4x2_t xy = vuzpq_f32(xy0, xy1);
                float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
                x = vaddq_f32(x, z);

                uint32x4_t mask = vcltq_f32(x, minDot);
                minDot = vbslq_f32(mask, x, minDot);
                index = vbslq_u32(mask, local_index, index);
                local_index = vaddq_u32(local_index, four);

                // second group of four vertices
                v0 = vld1q_f32_aligned_postincrement(vv);
                v1 = vld1q_f32_aligned_postincrement(vv);
                v2 = vld1q_f32_aligned_postincrement(vv);
                v3 = vld1q_f32_aligned_postincrement(vv);

                xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
                xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
                z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
                z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

                xy0 = vmulq_f32(xy0, vLo);
                xy1 = vmulq_f32(xy1, vLo);

                zb = vuzpq_f32(z0, z1);
                z = vmulq_f32(zb.val[0], vHi);
                xy = vuzpq_f32(xy0, xy1);
                x = vaddq_f32(xy.val[0], xy.val[1]);
                x = vaddq_f32(x, z);

                mask = vcltq_f32(x, minDot);
                minDot = vbslq_f32(mask, x, minDot);
                index = vbslq_u32(mask, local_index, index);
                local_index = vaddq_u32(local_index, four);
        }
 
        for (; i + 4 <= count; i += 4)
        {
                float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
                float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

                float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
                float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
                float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
                float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

                xy0 = vmulq_f32(xy0, vLo);
                xy1 = vmulq_f32(xy1, vLo);

                float32x4x2_t zb = vuzpq_f32(z0, z1);
                float32x4_t z = vmulq_f32(zb.val[0], vHi);
                float32x4x2_t xy = vuzpq_f32(xy0, xy1);
                float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
                x = vaddq_f32(x, z);

                uint32x4_t mask = vcltq_f32(x, minDot);
                minDot = vbslq_f32(mask, x, minDot);
                index = vbslq_u32(mask, local_index, index);
                local_index = vaddq_u32(local_index, four);
        }
 
        switch (count & 3)
        {
                case 3:
                {
                        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
                        float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

                        // duplicate the third vertex to fill the fourth lane
                        float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
                        float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
                        float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
                        float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));

                        xy0 = vmulq_f32(xy0, vLo);
                        xy1 = vmulq_f32(xy1, vLo);

                        float32x4x2_t zb = vuzpq_f32(z0, z1);
                        float32x4_t z = vmulq_f32(zb.val[0], vHi);
                        float32x4x2_t xy = vuzpq_f32(xy0, xy1);
                        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
                        x = vaddq_f32(x, z);

                        uint32x4_t mask = vcltq_f32(x, minDot);
                        minDot = vbslq_f32(mask, x, minDot);
                        index = vbslq_u32(mask, local_index, index);
                        local_index = vaddq_u32(local_index, four);
                }
                break;
                case 2:
                {
                        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
                        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

                        float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
                        float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));

                        xy0 = vmulq_f32(xy0, vLo);

                        float32x4x2_t zb = vuzpq_f32(z0, z0);
                        float32x4_t z = vmulq_f32(zb.val[0], vHi);
                        float32x4x2_t xy = vuzpq_f32(xy0, xy0);
                        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
                        x = vaddq_f32(x, z);

                        uint32x4_t mask = vcltq_f32(x, minDot);
                        minDot = vbslq_f32(mask, x, minDot);
                        index = vbslq_u32(mask, local_index, index);
                        local_index = vaddq_u32(local_index, four);
                }
                break;
                case 1:
                {
                        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);

                        float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
                        float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

                        xy0 = vmulq_f32(xy0, vLo);
                        z = vmulq_f32(z, vHi);
                        float32x4x2_t xy = vuzpq_f32(xy0, xy0);
                        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
                        x = vaddq_f32(x, z);

                        uint32x4_t mask = vcltq_f32(x, minDot);
                        minDot = vbslq_f32(mask, x, minDot);
                        index = vbslq_u32(mask, local_index, index);
                        local_index = vaddq_u32(local_index, four);
                }
                break;
        }
 
        // select the best answer between hi and lo results
        uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
        float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
        uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

        // select the best answer between even and odd lanes
        float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
        uint32x2_t indexHi = vdup_lane_u32(index2, 1);
        mask = vclt_f32(minDotO, minDot2);
        minDot2 = vbsl_f32(mask, minDotO, minDot2);
        index2 = vbsl_u32(mask, indexHi, index2);

        *dotResult = vget_lane_f32(minDot2, 0);
        return vget_lane_u32(index2, 0);
}
 
#else
#error Unhandled __APPLE__ arch
#endif