#if defined(_WIN32) || defined(__i386__)
#define BT_USE_SSE_IN_API
#endif

#if defined BT_USE_SIMD_VECTOR3

typedef float float4 __attribute__((vector_size(16)));

#if defined BT_USE_SSE || defined _WIN32

#define LOG2_ARRAY_SIZE 6
#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)

#include <emmintrin.h>

long _maxdot_large(const float* vv, const float* vec, unsigned long count, float* dotResult);
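/*
   _maxdot_large scans an array of 4-float (x,y,z,w) vertices and returns the
   index of the vertex whose xyz has the largest dot product with vec's xyz,
   writing that dot product to *dotResult. The w element is never read.
   Dot products are computed four at a time and parked in stack_array so the
   winning index can be recovered afterwards: _mm_movemask_ps turns a lane
   comparison into a 4-bit mask, and indexTable maps that mask to its lowest
   set lane ((unsigned char)-1 marks the impossible all-zero case).
*/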
long _maxdot_large(const float* vv, const float* vec, unsigned long count, float* dotResult)
{
    const float4* vertices = (const float4*)vv;
    // maps a movemask result to the index of its lowest set bit; entry 0 is unused
    static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
    float4 dotMax = btAssign128(-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY);
    float4 vvec = _mm_loadu_ps(vec);
    float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));  // vec.zzzz
    float4 vLo = _mm_movelh_ps(vvec, vvec);                                    // vec.xyxy

    long maxIndex = -1L;

    size_t segment = 0;
    float4 stack_array[STACK_ARRAY_COUNT];

    size_t index;
    float4 max;
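    /*
       For reference, a scalar sketch of what this routine computes
       (illustrative only; not part of the build):

           long maxdot_scalar(const float* vv, const float* vec,
                              unsigned long count, float* dotResult)
           {
               long best = -1;
               float bestDot = -BT_INFINITY;
               for (unsigned long k = 0; k < count; k++)
               {
                   const float* v = vv + 4 * k;  // xyzw stride, w ignored
                   float d = v[0] * vec[0] + v[1] * vec[1] + v[2] * vec[2];
                   if (d > bestDot)
                   {
                       bestDot = d;
                       best = (long)k;
                   }
               }
               *dotResult = bestDot;
               return best;
           }

       The SIMD code below computes the same result, four dot products per step.
    */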
    // Large loop: process full tiles of STACK_ARRAY_COUNT*4 vertices with no cleanup code.
    for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
    {
        max = dotMax;

        // four dot products at a time, four times per pass; w is never touched
        for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
        {
            for (size_t j = 0; j < 4; j++)
            {
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3];
                vertices += 4;

                float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
                float4 hi0 = _mm_movehl_ps(v1, v0);  // z0w0z1w1
                float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
                float4 hi1 = _mm_movehl_ps(v3, v2);  // z2w2z3w3

                lo0 = _mm_mul_ps(lo0, vLo);
                lo1 = _mm_mul_ps(lo1, vLo);
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);  // z0z1z2z3
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);  // x0x1x2x3 * vLo.x
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);  // y0y1y2y3 * vLo.y
                z = _mm_mul_ps(z, vHi);
                x = _mm_add_ps(x, y);
                x = _mm_add_ps(x, z);
                stack_array[index + j] = x;
                max = _mm_max_ps(x, max);  // control the order so max is never NaN, even if x is NaN
            }
        }

        // if we found a new max
        if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
        {
            // copy the new max across all lanes of the max accumulator
            max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
            max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));

            dotMax = max;

            // scan for the first occurrence of max in the array
            size_t test;
            for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)
            {
            }
            // record where it is
            maxIndex = 4 * index + segment + indexTable[test];
        }
    }
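    /*
       Lane-by-lane view of the transpose used above (w is loaded but never used):

           v0 = x0 y0 z0 w0      lo0 = movelh(v0, v1) = x0 y0 x1 y1
           v1 = x1 y1 z1 w1      hi0 = movehl(v1, v0) = z0 w0 z1 w1
           v2 = x2 y2 z2 w2      lo1 = movelh(v2, v3) = x2 y2 x3 y3
           v3 = x3 y3 z3 w3      hi1 = movehl(v3, v2) = z2 w2 z3 w3

           shufps(lo0, lo1, 0x88) = x0 x1 x2 x3
           shufps(lo0, lo1, 0xdd) = y0 y1 y2 y3
           shufps(hi0, hi1, 0x88) = z0 z1 z2 z3

       Multiplying lo0/lo1 by vLo (vec.xyxy) first means the shuffles directly
       yield x*vec.x and y*vec.y; adding z*vec.z finishes four dot products.
    */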
    // account for the work we've already done
    count -= segment;

    // deal with the last < STACK_ARRAY_COUNT * 4 vectors
    max = dotMax;
    index = 0;

    if (btUnlikely(count > 16))
    {
        // same four-at-a-time transpose-and-dot pass as above
        for (; index + 4 <= count / 4; index += 4)
        {
            for (size_t j = 0; j < 4; j++)
            {
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3];
                vertices += 4;

                float4 lo0 = _mm_movelh_ps(v0, v1);
                float4 hi0 = _mm_movehl_ps(v1, v0);
                float4 lo1 = _mm_movelh_ps(v2, v3);
                float4 hi1 = _mm_movehl_ps(v3, v2);

                lo0 = _mm_mul_ps(lo0, vLo);
                lo1 = _mm_mul_ps(lo1, vLo);
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = _mm_mul_ps(z, vHi);
                x = _mm_add_ps(x, y);
                x = _mm_add_ps(x, z);
                stack_array[index + j] = x;
                max = _mm_max_ps(x, max);
            }
        }
    }
    size_t localCount = (count & -4L) - 4 * index;
    if (localCount)
    {
#ifdef __APPLE__
        float4 t0, t1, t2, t3, t4;
        float4* sap = &stack_array[index + localCount / 4];
        vertices += localCount;  // jump to the end; the loop walks back with a negative byte offset
        size_t byteIndex = -(localCount) * sizeof(float);
        // AT&T-syntax assembly
        asm volatile(
            ".align 4                                                              \n\
         0: movaps  %[max], %[t2]                               // move max out of the way to avoid propagating NaNs in max \n\
            movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
            movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
            movaps  %[t0], %[max]                               // vertices[0]      \n\
            movlhps %[t1], %[max]                               // x0y0x1y1         \n\
            movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
            movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
            mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo   \n\
            movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
            movaps  %[t3], %[t0]                                // vertices[2]      \n\
            movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
            mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
            movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
            shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
            mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
            movaps  %[max], %[t3]                               // x0y0x1y1 * vLo   \n\
            shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
            shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
            addps   %[t3], %[max]                               // x + y            \n\
            addps   %[t1], %[max]                               // x + y + z        \n\
            movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
            maxps   %[t2], %[max]                               // record max, restore max \n\
            add     $16, %[byteIndex]                           // advance loop counter \n\
            jnz     0b                                          \n\
            "
            : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2),
              [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
            : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
            : "memory", "cc");
        index += localCount / 4;
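        /*
           Addressing note: vertices was advanced past the whole block before
           the loop, and byteIndex starts at -(localCount * sizeof(float)),
           counting up toward zero. (%[vertices], %[byteIndex], 4) therefore
           walks the block front to back, and "add $16" doubles as the loop
           test: it sets the zero flag on the final iteration, so "jnz 0b"
           exits with no separate compare. (%[sap], %[byteIndex]) stores one
           result per four vertices alongside the same counter.
        */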
#else
        {
            for (unsigned int i = 0; i < localCount / 4; i++, index++)
            {   // do four dot products at a time; never touch w
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3];
                vertices += 4;

                float4 lo0 = _mm_movelh_ps(v0, v1);
                float4 hi0 = _mm_movehl_ps(v1, v0);
                float4 lo1 = _mm_movelh_ps(v2, v3);
                float4 hi1 = _mm_movehl_ps(v3, v2);

                lo0 = _mm_mul_ps(lo0, vLo);
                lo1 = _mm_mul_ps(lo1, vLo);
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = _mm_mul_ps(z, vHi);
                x = _mm_add_ps(x, y);
                x = _mm_add_ps(x, z);
                stack_array[index] = x;
                max = _mm_max_ps(x, max);
            }
        }
#endif  // __APPLE__
    }
    // process the last few points
    if (count & 3)
    {
        float4 v0, v1, v2, x, y, z;
        switch (count & 3)
        {
            case 3:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];

                // calculate 3 dot products, transpose, duplicate v2
                float4 lo0 = _mm_movelh_ps(v0, v1);  // xyxy.lo
                float4 hi0 = _mm_movehl_ps(v1, v0);  // z?z?.lo
                lo0 = _mm_mul_ps(lo0, vLo);
                z = _mm_shuffle_ps(hi0, v2, 0xa8);  // z0z1z2z2
                z = _mm_mul_ps(z, vHi);
                float4 lo1 = _mm_movelh_ps(v2, v2);  // xyxy
                lo1 = _mm_mul_ps(lo1, vLo);
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            }
            break;
            case 2:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                float4 xy = _mm_movelh_ps(v0, v1);
                z = _mm_movehl_ps(v1, v0);
                xy = _mm_mul_ps(xy, vLo);
                z = _mm_shuffle_ps(z, z, 0xa8);
                x = _mm_shuffle_ps(xy, xy, 0xa8);
                z = _mm_mul_ps(z, vHi);
                y = _mm_shuffle_ps(xy, xy, 0xfd);
            }
            break;
            case 1:
            {
                float4 xy = vertices[0];
                z = _mm_shuffle_ps(xy, xy, 0xaa);
                xy = _mm_mul_ps(xy, vLo);
                z = _mm_mul_ps(z, vHi);
                x = _mm_shuffle_ps(xy, xy, 0);
                y = _mm_shuffle_ps(xy, xy, 0x55);
            }
            break;
        }
        x = _mm_add_ps(x, y);
        x = _mm_add_ps(x, z);
        stack_array[index] = x;
        max = _mm_max_ps(x, max);
        index++;
    }
    // if we found a new max
    if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
    {
        // copy the new max across all lanes of the max accumulator
        max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
        max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));

        dotMax = max;

        // scan for the first occurrence of max in the array
        size_t test;
        for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)
        {
        }
        maxIndex = 4 * index + segment + indexTable[test];
    }

    _mm_store_ss(dotResult, dotMax);
    return maxIndex;
}
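/*
   Typical call (illustrative; verts, dir and numVerts are hypothetical names):

       float bestDot;
       long best = _maxdot_large((const float*)verts, dir, numVerts, &bestDot);

   _mindot_large below is the same algorithm with the comparison sense flipped
   and the sentinel changed to +BT_INFINITY.
*/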
long _mindot_large(const float* vv, const float* vec, unsigned long count, float* dotResult);
long _mindot_large(const float* vv, const float* vec, unsigned long count, float* dotResult)
{
    const float4* vertices = (const float4*)vv;
    static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
    float4 dotmin = btAssign128(BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY);
    float4 vvec = _mm_loadu_ps(vec);
    float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));  // vec.zzzz
    float4 vLo = _mm_movelh_ps(vvec, vvec);                                    // vec.xyxy

    long minIndex = -1L;

    size_t segment = 0;
    float4 stack_array[STACK_ARRAY_COUNT];

    size_t index;
    float4 min;
    // Large loop: process full tiles of STACK_ARRAY_COUNT*4 vertices with no cleanup code.
    for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
    {
        min = dotmin;

        // four dot products at a time, four times per pass; w is never touched
        for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
        {
            for (size_t j = 0; j < 4; j++)
            {
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3];
                vertices += 4;

                float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
                float4 hi0 = _mm_movehl_ps(v1, v0);  // z0w0z1w1
                float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
                float4 hi1 = _mm_movehl_ps(v3, v2);  // z2w2z3w3

                lo0 = _mm_mul_ps(lo0, vLo);
                lo1 = _mm_mul_ps(lo1, vLo);
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);  // z0z1z2z3
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);  // x0x1x2x3 * vLo.x
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);  // y0y1y2y3 * vLo.y
                z = _mm_mul_ps(z, vHi);
                x = _mm_add_ps(x, y);
                x = _mm_add_ps(x, z);
                stack_array[index + j] = x;
                min = _mm_min_ps(x, min);  // control the order so min is never NaN, even if x is NaN
            }
        }

        // if we found a new min
        if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
        {
            // copy the new min across all lanes of the min accumulator
            min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
            min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));

            dotmin = min;

            // scan for the first occurrence of min in the array
            size_t test;
            for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)
            {
            }
            // record where it is
            minIndex = 4 * index + segment + indexTable[test];
        }
    }
    // account for the work we've already done
    count -= segment;

    // deal with the last < STACK_ARRAY_COUNT * 4 vectors
    min = dotmin;
    index = 0;

    if (btUnlikely(count > 16))
    {
        // same four-at-a-time transpose-and-dot pass as above
        for (; index + 4 <= count / 4; index += 4)
        {
            for (size_t j = 0; j < 4; j++)
            {
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3];
                vertices += 4;

                float4 lo0 = _mm_movelh_ps(v0, v1);
                float4 hi0 = _mm_movehl_ps(v1, v0);
                float4 lo1 = _mm_movelh_ps(v2, v3);
                float4 hi1 = _mm_movehl_ps(v3, v2);

                lo0 = _mm_mul_ps(lo0, vLo);
                lo1 = _mm_mul_ps(lo1, vLo);
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = _mm_mul_ps(z, vHi);
                x = _mm_add_ps(x, y);
                x = _mm_add_ps(x, z);
                stack_array[index + j] = x;
                min = _mm_min_ps(x, min);
            }
        }
    }
    size_t localCount = (count & -4L) - 4 * index;
    if (localCount)
    {
#ifdef __APPLE__
        vertices += localCount;  // jump to the end; the loop walks back with a negative byte offset
        float4 t0, t1, t2, t3, t4;
        size_t byteIndex = -(localCount) * sizeof(float);
        float4* sap = &stack_array[index + localCount / 4];
        // AT&T-syntax assembly
        asm volatile(
            ".align 4                                                              \n\
         0: movaps  %[min], %[t2]                               // move min out of the way to avoid propagating NaNs in min \n\
            movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
            movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
            movaps  %[t0], %[min]                               // vertices[0]      \n\
            movlhps %[t1], %[min]                               // x0y0x1y1         \n\
            movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
            movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
            mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo   \n\
            movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
            movaps  %[t3], %[t0]                                // vertices[2]      \n\
            movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
            movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
            mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
            shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
            mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
            movaps  %[min], %[t3]                               // x0y0x1y1 * vLo   \n\
            shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
            shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
            addps   %[t3], %[min]                               // x + y            \n\
            addps   %[t1], %[min]                               // x + y + z        \n\
            movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
            minps   %[t2], %[min]                               // record min, restore min \n\
            add     $16, %[byteIndex]                           // advance loop counter \n\
            jnz     0b                                          \n\
            "
            : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2),
              [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
            : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
            : "memory", "cc");
        index += localCount / 4;
#else
        {
            for (unsigned int i = 0; i < localCount / 4; i++, index++)
            {   // do four dot products at a time; never touch w
                float4 v0 = vertices[0];
                float4 v1 = vertices[1];
                float4 v2 = vertices[2];
                float4 v3 = vertices[3];
                vertices += 4;

                float4 lo0 = _mm_movelh_ps(v0, v1);
                float4 hi0 = _mm_movehl_ps(v1, v0);
                float4 lo1 = _mm_movelh_ps(v2, v3);
                float4 hi1 = _mm_movehl_ps(v3, v2);

                lo0 = _mm_mul_ps(lo0, vLo);
                lo1 = _mm_mul_ps(lo1, vLo);
                float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
                float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
                float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
                z = _mm_mul_ps(z, vHi);
                x = _mm_add_ps(x, y);
                x = _mm_add_ps(x, z);
                stack_array[index] = x;
                min = _mm_min_ps(x, min);
            }
        }
#endif  // __APPLE__
    }
    // process the last few points
    if (count & 3)
    {
        float4 v0, v1, v2, x, y, z;
        switch (count & 3)
        {
            case 3:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                v2 = vertices[2];

                // calculate 3 dot products, transpose, duplicate v2
                float4 lo0 = _mm_movelh_ps(v0, v1);  // xyxy.lo
                float4 hi0 = _mm_movehl_ps(v1, v0);  // z?z?.lo
                lo0 = _mm_mul_ps(lo0, vLo);
                z = _mm_shuffle_ps(hi0, v2, 0xa8);  // z0z1z2z2
                z = _mm_mul_ps(z, vHi);
                float4 lo1 = _mm_movelh_ps(v2, v2);  // xyxy
                lo1 = _mm_mul_ps(lo1, vLo);
                x = _mm_shuffle_ps(lo0, lo1, 0x88);
                y = _mm_shuffle_ps(lo0, lo1, 0xdd);
            }
            break;
            case 2:
            {
                v0 = vertices[0];
                v1 = vertices[1];
                float4 xy = _mm_movelh_ps(v0, v1);
                z = _mm_movehl_ps(v1, v0);
                xy = _mm_mul_ps(xy, vLo);
                z = _mm_shuffle_ps(z, z, 0xa8);
                x = _mm_shuffle_ps(xy, xy, 0xa8);
                z = _mm_mul_ps(z, vHi);
                y = _mm_shuffle_ps(xy, xy, 0xfd);
            }
            break;
            case 1:
            {
                float4 xy = vertices[0];
                z = _mm_shuffle_ps(xy, xy, 0xaa);
                xy = _mm_mul_ps(xy, vLo);
                z = _mm_mul_ps(z, vHi);
                x = _mm_shuffle_ps(xy, xy, 0);
                y = _mm_shuffle_ps(xy, xy, 0x55);
            }
            break;
        }
        x = _mm_add_ps(x, y);
        x = _mm_add_ps(x, z);
        stack_array[index] = x;
        min = _mm_min_ps(x, min);
        index++;
    }
    // if we found a new min
    if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
    {
        // copy the new min across all lanes of the min accumulator
        min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
        min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));

        dotmin = min;

        // scan for the first occurrence of min in the array
        size_t test;
        for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)
        {
        }
        minIndex = 4 * index + segment + indexTable[test];
    }

    _mm_store_ss(dotResult, dotmin);
    return minIndex;
}
#elif defined BT_USE_NEON

#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>
#include <sys/types.h>
#include <sys/sysctl.h>  // for sysctlbyname

static long _maxdot_large_v0(const float* vv, const float* vec, unsigned long count, float* dotResult);
static long _maxdot_large_v1(const float* vv, const float* vec, unsigned long count, float* dotResult);
static long _maxdot_large_sel(const float* vv, const float* vec, unsigned long count, float* dotResult);
static long _mindot_large_v0(const float* vv, const float* vec, unsigned long count, float* dotResult);
static long _mindot_large_v1(const float* vv, const float* vec, unsigned long count, float* dotResult);
static long _mindot_large_sel(const float* vv, const float* vec, unsigned long count, float* dotResult);

long (*_maxdot_large)(const float* vv, const float* vec, unsigned long count, float* dotResult) = _maxdot_large_sel;
long (*_mindot_large)(const float* vv, const float* vec, unsigned long count, float* dotResult) = _mindot_large_sel;

static inline uint32_t btGetCpuCapabilities(void)
{
    static uint32_t capabilities = 0;
    static bool testedCapabilities = false;

    if (0 == testedCapabilities)
    {
        uint32_t hasFeature = 0;
        size_t featureSize = sizeof(hasFeature);
        int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0);

        if (0 == err && hasFeature)
            capabilities |= 0x2000;

        testedCapabilities = true;
    }

    return capabilities;
}
static long _maxdot_large_sel(const float* vv, const float* vec, unsigned long count, float* dotResult)
{
    if (btGetCpuCapabilities() & 0x2000)
        _maxdot_large = _maxdot_large_v1;
    else
        _maxdot_large = _maxdot_large_v0;

    return _maxdot_large(vv, vec, count, dotResult);
}

static long _mindot_large_sel(const float* vv, const float* vec, unsigned long count, float* dotResult)
{
    if (btGetCpuCapabilities() & 0x2000)
        _mindot_large = _mindot_large_v1;
    else
        _mindot_large = _mindot_large_v0;

    return _mindot_large(vv, vec, count, dotResult);
}
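/*
   Dispatch pattern: _maxdot_large and _mindot_large are function pointers
   that start out aimed at the _sel routines. On first use, the _sel routine
   probes the CPU once, repoints the pointer at the preferred implementation,
   and forwards the call, so every subsequent call dispatches directly with
   no feature check. The hw.optional.neon_hpfp sysctl gates the v1 variants.
*/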
#if defined __arm__
#define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
#else
// support 64-bit arm: plain aligned load, then advance the pointer manually
#define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*)((const char*)(_ptr) + 16L); /*return*/ _r; })
#endif

long _maxdot_large_v0(const float* vv, const float* vec, unsigned long count, float* dotResult)
{
    unsigned long i = 0;
    float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
    float32x2_t vLo = vget_low_f32(vvec);
    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
    float32x2_t dotMaxLo = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
    float32x2_t dotMaxHi = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
    uint32x2_t indexLo = (uint32x2_t){0, 1};
    uint32x2_t indexHi = (uint32x2_t){2, 3};
    uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
    uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
    const uint32x2_t four = (uint32x2_t){4, 4};
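    /*
       v0 strategy: two 2-lane accumulators run in parallel, dotMaxLo for
       results 0,1 and dotMaxHi for results 2,3 of each group of four, each
       paired with an index vector (iLo/iHi hold winners, indexLo/indexHi
       count positions). vpadd_f32 sums each x*vec.x, y*vec.y pair, the
       transposed z lane is added, then vcgt_f32/vbsl_f32 keep the running
       best values and their indices. The two halves are merged only once,
       after all loops finish.
    */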
    for (; i + 8 <= count; i += 8)
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

        float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32(z0.val[0], vHi);
        float32x2_t zHi = vmul_f32(z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32(xy0, xy1);
        float32x2_t rHi = vpadd_f32(xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
        uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
        dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
        dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);

        v0 = vld1q_f32_aligned_postincrement(vv);
        v1 = vld1q_f32_aligned_postincrement(vv);
        v2 = vld1q_f32_aligned_postincrement(vv);
        v3 = vld1q_f32_aligned_postincrement(vv);

        xy0 = vmul_f32(vget_low_f32(v0), vLo);
        xy1 = vmul_f32(vget_low_f32(v1), vLo);
        xy2 = vmul_f32(vget_low_f32(v2), vLo);
        xy3 = vmul_f32(vget_low_f32(v3), vLo);

        z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
        z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
        zLo = vmul_f32(z0.val[0], vHi);
        zHi = vmul_f32(z1.val[0], vHi);

        rLo = vpadd_f32(xy0, xy1);
        rHi = vpadd_f32(xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        maskLo = vcgt_f32(rLo, dotMaxLo);
        maskHi = vcgt_f32(rHi, dotMaxHi);
        dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
        dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }
    for (; i + 4 <= count; i += 4)
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

        float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32(z0.val[0], vHi);
        float32x2_t zHi = vmul_f32(z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32(xy0, xy1);
        float32x2_t rHi = vpadd_f32(xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
        uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
        dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
        dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }
    switch (count & 3)
    {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
            float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
            float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

            float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
            float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);

            float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32(z0.val[0], vHi);
            float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);

            float32x2_t rLo = vpadd_f32(xy0, xy1);
            float32x2_t rHi = vpadd_f32(xy2, xy2);
            rLo = vadd_f32(rLo, zLo);
            rHi = vadd_f32(rHi, zHi);

            uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
            uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
            dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
            dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
            iHi = vbsl_u32(maskHi, indexHi, iHi);
        }
        break;
        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
            float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

            float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);

            float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32(z0.val[0], vHi);

            float32x2_t rLo = vpadd_f32(xy0, xy1);
            rLo = vadd_f32(rLo, zLo);

            uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
            dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
            float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
            float32x2_t zLo = vmul_f32(z0, vHi);
            float32x2_t rLo = vpadd_f32(xy0, xy0);
            rLo = vadd_f32(rLo, zLo);
            uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
            dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
    }
    // select best answer between hi and lo results
    uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    // select best answer between even and odd results
    dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
    iHi = vdup_lane_u32(iLo, 1);
    mask = vcgt_f32(dotMaxHi, dotMaxLo);
    dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    *dotResult = vget_lane_f32(dotMaxLo, 0);
    return vget_lane_u32(iLo, 0);
}
long _maxdot_large_v1(const float* vv, const float* vec, unsigned long count, float* dotResult)
{
    float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
    const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
    uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
    uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
    float32x4_t maxDot = (float32x4_t){-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY};

    unsigned long i = 0;
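    /*
       v1 strategy (in contrast to v0): everything stays in quad registers.
       vuzpq_f32 de-interleaves the vLo-multiplied xy pairs into one x vector
       and one y vector, so four dot products land in a single float32x4_t,
       with one uint32x4_t tracking the winning indices. The selector only
       picks this variant when hw.optional.neon_hpfp reports the faster NEON
       floating-point unit.
    */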
    for (; i + 8 <= count; i += 8)
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

        // the next two lines should resolve to a single vswp d, d
        float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
        // the next two lines should resolve to a single vswp d, d
        float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32(z0, z1);
        float32x4_t z = vmulq_f32(zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32(xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcgtq_f32(x, maxDot);
        maxDot = vbslq_f32(mask, x, maxDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);

        v0 = vld1q_f32_aligned_postincrement(vv);
        v1 = vld1q_f32_aligned_postincrement(vv);
        v2 = vld1q_f32_aligned_postincrement(vv);
        v3 = vld1q_f32_aligned_postincrement(vv);

        xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
        xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));

        z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
        z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        zb = vuzpq_f32(z0, z1);
        z = vmulq_f32(zb.val[0], vHi);
        xy = vuzpq_f32(xy0, xy1);
        x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        mask = vcgtq_f32(x, maxDot);
        maxDot = vbslq_f32(mask, x, maxDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }
    for (; i + 4 <= count; i += 4)
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

        float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));

        float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32(z0, z1);
        float32x4_t z = vmulq_f32(zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32(xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcgtq_f32(x, maxDot);
        maxDot = vbslq_f32(mask, x, maxDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }
    switch (count & 3)
    {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
            float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
            float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

            // calculate 3 dot products, duplicating v2 in the fourth lane
            float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
            float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));

            float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
            float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));

            xy0 = vmulq_f32(xy0, vLo);
            xy1 = vmulq_f32(xy1, vLo);

            float32x4x2_t zb = vuzpq_f32(z0, z1);
            float32x4_t z = vmulq_f32(zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32(xy0, xy1);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcgtq_f32(x, maxDot);
            maxDot = vbslq_f32(mask, x, maxDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;
        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
            float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

            float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
            float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));

            xy0 = vmulq_f32(xy0, vLo);

            float32x4x2_t zb = vuzpq_f32(z0, z0);
            float32x4_t z = vmulq_f32(zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32(xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcgtq_f32(x, maxDot);
            maxDot = vbslq_f32(mask, x, maxDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;
        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);

            float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

            xy0 = vmulq_f32(xy0, vLo);
            z = vmulq_f32(z, vHi);

            float32x4x2_t xy = vuzpq_f32(xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcgtq_f32(x, maxDot);
            maxDot = vbslq_f32(mask, x, maxDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;
    }
    // select best answer between hi and lo results
    uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));
    float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

    // select best answer between even and odd results
    float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
    mask = vcgt_f32(maxDotO, maxDot2);
    maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
    index2 = vbsl_u32(mask, indexHi, index2);

    *dotResult = vget_lane_f32(maxDot2, 0);
    return vget_lane_u32(index2, 0);
}
long _mindot_large_v0(const float* vv, const float* vec, unsigned long count, float* dotResult)
{
    unsigned long i = 0;
    float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
    float32x2_t vLo = vget_low_f32(vvec);
    float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
    float32x2_t dotMinLo = (float32x2_t){BT_INFINITY, BT_INFINITY};
    float32x2_t dotMinHi = (float32x2_t){BT_INFINITY, BT_INFINITY};
    uint32x2_t indexLo = (uint32x2_t){0, 1};
    uint32x2_t indexHi = (uint32x2_t){2, 3};
    uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
    uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
    const uint32x2_t four = (uint32x2_t){4, 4};
    for (; i + 8 <= count; i += 8)
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

        float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32(z0.val[0], vHi);
        float32x2_t zHi = vmul_f32(z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32(xy0, xy1);
        float32x2_t rHi = vpadd_f32(xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
        uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
        dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
        dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);

        v0 = vld1q_f32_aligned_postincrement(vv);
        v1 = vld1q_f32_aligned_postincrement(vv);
        v2 = vld1q_f32_aligned_postincrement(vv);
        v3 = vld1q_f32_aligned_postincrement(vv);

        xy0 = vmul_f32(vget_low_f32(v0), vLo);
        xy1 = vmul_f32(vget_low_f32(v1), vLo);
        xy2 = vmul_f32(vget_low_f32(v2), vLo);
        xy3 = vmul_f32(vget_low_f32(v3), vLo);

        z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
        z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
        zLo = vmul_f32(z0.val[0], vHi);
        zHi = vmul_f32(z1.val[0], vHi);

        rLo = vpadd_f32(xy0, xy1);
        rHi = vpadd_f32(xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        maskLo = vclt_f32(rLo, dotMinLo);
        maskHi = vclt_f32(rHi, dotMinHi);
        dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
        dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }
    for (; i + 4 <= count; i += 4)
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

        float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32(z0.val[0], vHi);
        float32x2_t zHi = vmul_f32(z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32(xy0, xy1);
        float32x2_t rHi = vpadd_f32(xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
        uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
        dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
        dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }
    switch (count & 3)
    {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
            float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
            float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

            float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
            float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);

            float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32(z0.val[0], vHi);
            float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);

            float32x2_t rLo = vpadd_f32(xy0, xy1);
            float32x2_t rHi = vpadd_f32(xy2, xy2);
            rLo = vadd_f32(rLo, zLo);
            rHi = vadd_f32(rHi, zHi);

            uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
            uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
            dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
            dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
            iHi = vbsl_u32(maskHi, indexHi, iHi);
        }
        break;
        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
            float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

            float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);

            float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32(z0.val[0], vHi);

            float32x2_t rLo = vpadd_f32(xy0, xy1);
            rLo = vadd_f32(rLo, zLo);

            uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
            dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
            float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
            float32x2_t zLo = vmul_f32(z0, vHi);
            float32x2_t rLo = vpadd_f32(xy0, xy0);
            rLo = vadd_f32(rLo, zLo);
            uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
            dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;
    }
    // select best answer between hi and lo results
    uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    // select best answer between even and odd results
    dotMinHi = vdup_lane_f32(dotMinLo, 1);
    iHi = vdup_lane_u32(iLo, 1);
    mask = vclt_f32(dotMinHi, dotMinLo);
    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    *dotResult = vget_lane_f32(dotMinLo, 0);
    return vget_lane_u32(iLo, 0);
}
long _mindot_large_v1(const float* vv, const float* vec, unsigned long count, float* dotResult)
{
    float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
    const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
    uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
    uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
    float32x4_t minDot = (float32x4_t){BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY};

    unsigned long i = 0;
  1479     unsigned long i = 0;
  1480     for( ; i + 8 <= count; i += 8 )
  1482         float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
  1483         float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
  1484         float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
  1485         float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
  1488         float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
  1489         float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
  1491         float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
  1492         float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
  1494         xy0 = vmulq_f32(xy0, vLo);
  1495         xy1 = vmulq_f32(xy1, vLo);
  1497         float32x4x2_t zb = vuzpq_f32( z0, z1);
  1498         float32x4_t z = vmulq_f32( zb.val[0], vHi);
  1499         float32x4x2_t xy = vuzpq_f32( xy0, xy1);
  1500         float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
  1501         x = vaddq_f32(x, z);
  1503         uint32x4_t mask = vcltq_f32(x, minDot);
  1504         minDot = vbslq_f32( mask, x, minDot);
  1505         index = vbslq_u32(mask, local_index, index);
  1506         local_index = vaddq_u32(local_index, four);
  1508         v0 = vld1q_f32_aligned_postincrement( vv );
  1509         v1 = vld1q_f32_aligned_postincrement( vv );
  1510         v2 = vld1q_f32_aligned_postincrement( vv );
  1511         v3 = vld1q_f32_aligned_postincrement( vv );
  1514         xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
  1515         xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
  1517         z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
  1518         z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
  1520         xy0 = vmulq_f32(xy0, vLo);
  1521         xy1 = vmulq_f32(xy1, vLo);
  1523         zb = vuzpq_f32( z0, z1);
  1524         z = vmulq_f32( zb.val[0], vHi);
  1525         xy = vuzpq_f32( xy0, xy1);
  1526         x = vaddq_f32(xy.val[0], xy.val[1]);
  1527         x = vaddq_f32(x, z);
  1529         mask = vcltq_f32(x, minDot);
  1530         minDot = vbslq_f32( mask, x, minDot);
  1531         index = vbslq_u32(mask, local_index, index);
  1532         local_index = vaddq_u32(local_index, four);
    for (; i + 4 <= count; i += 4)
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
        float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

        float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));

        float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32(z0, z1);
        float32x4_t z = vmulq_f32(zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32(xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcltq_f32(x, minDot);
        minDot = vbslq_f32(mask, x, minDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }
    switch (count & 3)
    {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
            float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
            float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

            // calculate 3 dot products, duplicating v2 in the fourth lane
            float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
            float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));

            float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
            float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));

            xy0 = vmulq_f32(xy0, vLo);
            xy1 = vmulq_f32(xy1, vLo);

            float32x4x2_t zb = vuzpq_f32(z0, z1);
            float32x4_t z = vmulq_f32(zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32(xy0, xy1);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32(mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;
        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
            float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

            float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
            float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));

            xy0 = vmulq_f32(xy0, vLo);

            float32x4x2_t zb = vuzpq_f32(z0, z0);
            float32x4_t z = vmulq_f32(zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32(xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32(mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;
        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);

            float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

            xy0 = vmulq_f32(xy0, vLo);
            z = vmulq_f32(z, vHi);

            float32x4x2_t xy = vuzpq_f32(xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32(mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;
    }
    // select best answer between hi and lo results
    uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
    float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

    // select best answer between even and odd results
    float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
    mask = vclt_f32(minDotO, minDot2);
    minDot2 = vbsl_f32(mask, minDotO, minDot2);
    index2 = vbsl_u32(mask, indexHi, index2);

    *dotResult = vget_lane_f32(minDot2, 0);
    return vget_lane_u32(index2, 0);
}

#else
    #error Unhandled __APPLE__ arch
#endif

#endif  // BT_USE_SIMD_VECTOR3