#ifndef BT_MATRIX3x3_H
#define BT_MATRIX3x3_H

#define vMPPP (_mm_set_ps(+0.0f, +0.0f, +0.0f, -0.0f))

#if defined(BT_USE_SSE)
#define v1000 (_mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f))
#define v0100 (_mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f))
#define v0010 (_mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f))
#elif defined(BT_USE_NEON)
#ifdef BT_USE_DOUBLE_PRECISION
#define btMatrix3x3Data btMatrix3x3DoubleData
#else
#define btMatrix3x3Data btMatrix3x3FloatData
#endif  //BT_USE_DOUBLE_PRECISION
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
                m_el[0].mVec128 = rhs.m_el[0].mVec128;
                m_el[1].mVec128 = rhs.m_el[1].mVec128;
                m_el[2].mVec128 = rhs.m_el[2].mVec128;
 
                m_el[0].mVec128 = m.m_el[0].mVec128;
                m_el[1].mVec128 = m.m_el[1].mVec128;
                m_el[2].mVec128 = m.m_el[2].mVec128;
 
                m_el[0] = other.m_el[0];
                m_el[1] = other.m_el[1];
                m_el[2] = other.m_el[2];
 
                m_el[0] = other.m_el[0];
                m_el[1] = other.m_el[1];
                m_el[2] = other.m_el[2];
 
                return btVector3(m_el[0][i], m_el[1][i], m_el[2][i]);
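                // getColumn(i) gathers element i of each row; since the matrix is
                // stored as three row vectors, this is how a column is read out.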
 
                m_el[2].setValue(m[2], m[6], m[10]);
 
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
                __m128 vs, Q = q.get128();
                __m128i Qi = btCastfTo128i(Q);
                __m128 Y, Z;
                __m128 V1, V2, V3;
                __m128 V11, V21, V31;
                __m128 NQ = _mm_xor_ps(Q, btvMzeroMask);
                __m128i NQi = btCastfTo128i(NQ);

                V1 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 0, 2, 3)));  // (y, x, z, w)
                V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0, 0, 1, 3));                 // (-x, -x, y, w)
                V3 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(2, 1, 0, 3)));  // (z, y, x, w)
                V1 = _mm_xor_ps(V1, vMPPP);                                         // flip the sign of the first lane

                V11 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 1, 0, 3)));  // (y, y, x, w)
                V21 = _mm_unpackhi_ps(Q, Q);                                         // (z, z, w, w)
                V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0, 2, 0, 3));                 // (x, z, -x, -w)

                V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2, 3, 1, 3));                // (-z, -w, y, w)
                V21 = _mm_xor_ps(V21, vMPPP);                                       // flip the sign of the first lane
                V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3, 3, 1, 3));                // (w, w, -y, -w)
                V31 = _mm_xor_ps(V31, vMPPP);                                       // flip the sign of the first lane
                Y = btCastiTo128f(_mm_shuffle_epi32(NQi, BT_SHUFFLE(3, 2, 0, 3)));  // (-w, -z, -x, -w)
                Z = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 0, 1, 3)));   // (y, x, y, w)

                vs = _mm_load_ss(&s);

                vs = bt_splat3_ps(vs, 0);
 
                btScalar xs = q.x() * s, ys = q.y() * s, zs = q.z() * s;
                btScalar wx = q.w() * xs, wy = q.w() * ys, wz = q.w() * zs;
                btScalar xx = q.x() * xs, xy = q.x() * ys, xz = q.x() * zs;
                btScalar yy = q.y() * ys, yz = q.y() * zs, zz = q.z() * zs;
                setValue(
                        btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
                        xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
                        xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
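                // Scalar fallback: the standard quaternion-to-matrix expansion,
                // where s is 2/|q|^2 so non-unit quaternions still yield a proper
                // rotation. A minimal round-trip sketch (hypothetical usage, not
                // part of this header):
                //
                //   btQuaternion q(btVector3(0, 0, 1), SIMD_HALF_PI);  // 90 deg about Z
                //   btMatrix3x3 m;
                //   m.setRotation(q);
                //   btQuaternion q2;
                //   m.getRotation(q2);  // recovers q up to sign and round-off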
 
                setEulerZYX(roll, pitch, yaw);
 
                setValue(cj * ch, sj * sc - cs, sj * cc + ss,
                         cj * sh, sj * ss + cc, sj * cs - sc,
                         -sj, cj * si, cj * ci);
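                // Composes R = Rz * Ry * Rx. The abbreviations follow the usual
                // pattern (an assumption from the elided declarations above):
                // ci/si, cj/sj, ch/sh are the cosines/sines of the X, Y and Z
                // angles, and cc, cs, sc, ss cache their pairwise products.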
 
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
                static const btMatrix3x3 identityMatrix(v1000, v0100, v0010);
 
                return identityMatrix;
 
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
                __m128 v0 = m_el[0].mVec128;
                __m128 v1 = m_el[1].mVec128;
                __m128 v2 = m_el[2].mVec128;
                __m128* vm = (__m128*)m;
                __m128 vT;

                v2 = _mm_and_ps(v2, btvFFF0fMask);  // (x2, y2, z2, 0)

                vT = _mm_unpackhi_ps(v0, v1);  // (z0, z1, *, *)
                v0 = _mm_unpacklo_ps(v0, v1);  // (x0, x1, y0, y1)

                v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3));                    // (y0, y1, y2, 0)
                v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3));                    // (x0, x1, x2, 0)
                v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));  // (z0, z1, z2, 0)
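                // Classic SIMD 3x3 transpose: unpacklo/unpackhi interleave rows 0
                // and 1, the two shuffles merge in row 2's lanes, and _mm_move_sd
                // stitches the z column together. Each result's w lane is zeroed.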
 
#elif defined(BT_USE_NEON)
                // zMask clears the w lane of the last column vector
                static const uint32x2_t zMask = (const uint32x2_t){static_cast<uint32_t>(-1), 0};
                float32x4_t* vm = (float32x4_t*)m;
                float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);               // {x0 x1 z0 z1}, {y0 y1 w0 w1}
                float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));  // {x2 0}, {y2 0}
                float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
                float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
                float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
                float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);  // (z0, z1, z2, 0)
 
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
                btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
 
                        temp.f[0] = m_el[2].y() - m_el[1].z();
                        temp.f[1] = m_el[0].z() - m_el[2].x();
                        temp.f[2] = m_el[1].x() - m_el[0].y();
 
                        if (m_el[0].x() < m_el[1].y())

                                if (m_el[1].y() < m_el[2].z())

                                if (m_el[0].x() < m_el[2].z())
 
                        x = m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0);
 
                        temp.f[3] = (m_el[k][j] - m_el[j][k]);
                        temp.f[j] = (m_el[j][i] + m_el[i][j]);
                        temp.f[k] = (m_el[k][i] + m_el[i][k]);
 
                btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
 
                        temp[0] = ((m_el[2].y() - m_el[1].z()) * s);
                        temp[1] = ((m_el[0].z() - m_el[2].x()) * s);
                        temp[2] = ((m_el[1].x() - m_el[0].y()) * s);
 
                        int i = m_el[0].x() < m_el[1].y() ? (m_el[1].y() < m_el[2].z() ? 2 : 1) : (m_el[0].x() < m_el[2].z() ? 2 : 0);
 
                        temp[3] = (m_el[k][j] - m_el[j][k]) * s;
                        temp[j] = (m_el[j][i] + m_el[i][j]) * s;
                        temp[k] = (m_el[k][i] + m_el[i][k]) * s;
 
                q.setValue(temp[0], temp[1], temp[2], temp[3]);
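                // Stable matrix-to-quaternion conversion: when the trace is
                // positive, w dominates and is safe to divide by; otherwise the
                // largest diagonal element picks which of x, y, z to solve for
                // first (i, j, k are the cyclic permutation from that pivot).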
 
                if (btFabs(m_el[2].x()) >= 1)
 
                                euler_out.roll = euler_out.pitch + delta;
                                euler_out2.roll = euler_out.pitch + delta;

                                euler_out.roll = -euler_out.pitch + delta;
                                euler_out2.roll = -euler_out.pitch + delta;
 
                        euler_out.pitch = -btAsin(m_el[2].x());
                        euler_out2.pitch = SIMD_PI - euler_out.pitch;

                        euler_out.roll = btAtan2(m_el[2].y() / btCos(euler_out.pitch),
                                                 m_el[2].z() / btCos(euler_out.pitch));
                        euler_out2.roll = btAtan2(m_el[2].y() / btCos(euler_out2.pitch),
                                                  m_el[2].z() / btCos(euler_out2.pitch));

                        euler_out.yaw = btAtan2(m_el[1].x() / btCos(euler_out.pitch),
                                                m_el[0].x() / btCos(euler_out.pitch));
                        euler_out2.yaw = btAtan2(m_el[1].x() / btCos(euler_out2.pitch),
                                                 m_el[0].x() / btCos(euler_out2.pitch));
 
                if (solution_number == 1)
                {
                        yaw = euler_out.yaw;
                        pitch = euler_out.pitch;
                        roll = euler_out.roll;
                }
                else
                {
                        yaw = euler_out2.yaw;
                        pitch = euler_out2.pitch;
                        roll = euler_out2.roll;
                }
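                // A rotation matrix maps to two (pitch, yaw, roll) triples:
                // euler_out takes the pitch in (-pi/2, pi/2), euler_out2 the
                // mirrored pitch pi - asin(...). When |m_el[2].x()| >= 1 the matrix
                // is gimbal-locked, only a combination of yaw and roll is
                // determined, and the branch above folds the freedom into roll
                // via delta.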
 
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
                return btMatrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);
 
                return btMatrix3x3(
                        m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(),
                        m_el[1].x() * s.x(), m_el[1].y() * s.y(), m_el[1].z() * s.z(),
                        m_el[2].x() * s.x(), m_el[2].y() * s.y(), m_el[2].z() * s.z());
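                // scaled(s) right-multiplies by diag(s): column j of the result is
                // column j of this matrix times s[j]. The SIMD path can use whole
                // rows because btVector3's operator* is componentwise.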
 
                return m_el[0].x() * v.x() + m_el[1].x() * v.y() + m_el[2].x() * v.z();
 
                return m_el[0].y() * v.x() + m_el[1].y() * v.y() + m_el[2].y() * v.z();
 
                return m_el[0].z() * v.x() + m_el[1].z() * v.y() + m_el[2].z() * v.z();
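                // tdotx/y/z dot v against a *column* of the matrix (a row of the
                // transpose); transposeTimes() and operator*(btVector3,
                // btMatrix3x3) are built on the same column-dot pattern.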
 
                for (iter = 0; iter < maxIter; iter++)
 
                for (int step = maxSteps; step > 0; step--)
 
                        btScalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq);
 
                                t = (theta >= 0) ? 1 / (theta + btSqrt(1 + theta2))
                                                 : 1 / (theta - btSqrt(1 + theta2));
                                cos = 1 / btSqrt(1 + t * t);

                                t = 1 / (theta * (2 + btScalar(0.5) / theta2));
 
                        m_el[p][q] = m_el[q][p] = 0;
                        m_el[p][p] -= t * mpq;
                        m_el[q][q] += t * mpq;

                        m_el[r][p] = m_el[p][r] = cos * mrp - sin * mrq;
                        m_el[r][q] = m_el[q][r] = cos * mrq + sin * mrp;
 
                        for (int i = 0; i < 3; i++)

                                row[p] = cos * mrp - sin * mrq;
                                row[q] = cos * mrq + sin * mrp;
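                // One classic Jacobi rotation step: theta, t, cos and sin are
                // chosen so that rotating rows/columns p and q annihilates the
                // off-diagonal entry m[p][q], while row r (the remaining index) is
                // updated to keep the matrix symmetric.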
 
                return m_el[r1][c1] * m_el[r2][c2] - m_el[r1][c2] * m_el[r2][c1];
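                // cofac(r1, c1, r2, c2) is the 2x2 minor determinant
                // m[r1][c1]*m[r2][c2] - m[r1][c2]*m[r2][c1]; adjoint() and
                // inverse() use it to assemble the cofactor matrix.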
 
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
        __m128 rv00, rv01, rv02;
        __m128 rv10, rv11, rv12;
        __m128 rv20, rv21, rv22;
        __m128 mv0, mv1, mv2;

        rv02 = m_el[0].mVec128;
        rv12 = m_el[1].mVec128;
        rv22 = m_el[2].mVec128;

        mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask);
        mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask);
        mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask);
 
        rv00 = bt_splat_ps(rv02, 0);
        rv01 = bt_splat_ps(rv02, 1);
        rv02 = bt_splat_ps(rv02, 2);

        rv00 = _mm_mul_ps(rv00, mv0);
        rv01 = _mm_mul_ps(rv01, mv1);
        rv02 = _mm_mul_ps(rv02, mv2);

        rv10 = bt_splat_ps(rv12, 0);
        rv11 = bt_splat_ps(rv12, 1);
        rv12 = bt_splat_ps(rv12, 2);

        rv10 = _mm_mul_ps(rv10, mv0);
        rv11 = _mm_mul_ps(rv11, mv1);
        rv12 = _mm_mul_ps(rv12, mv2);

        rv20 = bt_splat_ps(rv22, 0);
        rv21 = bt_splat_ps(rv22, 1);
        rv22 = bt_splat_ps(rv22, 2);

        rv20 = _mm_mul_ps(rv20, mv0);
        rv21 = _mm_mul_ps(rv21, mv1);
        rv22 = _mm_mul_ps(rv22, mv2);

        rv00 = _mm_add_ps(rv00, rv01);
        rv10 = _mm_add_ps(rv10, rv11);
        rv20 = _mm_add_ps(rv20, rv21);

        m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
        m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
        m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
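        // Row-major SIMD multiply: each element of row i of *this is splatted
        // across a register and multiplied by the matching row of m; summing the
        // three products yields row i of the product without any transposes.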
 
#elif defined(BT_USE_NEON)
        float32x4_t rv0, rv1, rv2;
        float32x4_t v0, v1, v2;
        float32x4_t mv0, mv1, mv2;

        v0 = m_el[0].mVec128;
        v1 = m_el[1].mVec128;
        v2 = m_el[2].mVec128;

        mv0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
        mv1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
        mv2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);
 
        rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
        rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
        rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

        rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
        rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
        rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

        rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
        rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
        rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);

        m_el[0].mVec128 = rv0;
        m_el[1].mVec128 = rv1;
        m_el[2].mVec128 = rv2;
 
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
        __m128 vk = bt_splat_ps(_mm_load_ss((float*)&k), 0x80);
        return btMatrix3x3(
                _mm_mul_ps(m[0].mVec128, vk),
                _mm_mul_ps(m[1].mVec128, vk),
                _mm_mul_ps(m[2].mVec128, vk));
 
#elif defined(BT_USE_NEON)
        return btMatrix3x3(
                vmulq_n_f32(m[0].mVec128, k),
                vmulq_n_f32(m[1].mVec128, k),
                vmulq_n_f32(m[2].mVec128, k));
 
        return btMatrix3x3(
                m[0].x() * k, m[0].y() * k, m[0].z() * k,
                m[1].x() * k, m[1].y() * k, m[1].z() * k,
                m[2].x() * k, m[2].y() * k, m[2].z() * k);
 
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
        return btMatrix3x3(
                m1[0].mVec128 + m2[0].mVec128,
                m1[1].mVec128 + m2[1].mVec128,
                m1[2].mVec128 + m2[2].mVec128);
 
                m1[2][2] + m2[2][2]);
 
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
        return btMatrix3x3(
                m1[0].mVec128 - m2[0].mVec128,
                m1[1].mVec128 - m2[1].mVec128,
                m1[2].mVec128 - m2[2].mVec128);
 
                m1[2][2] - m2[2][2]);
 
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
        return btTriple((*this)[0], (*this)[1], (*this)[2]);
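        // det(M) = row0 . (row1 x row2): the scalar triple product of the rows
        // equals the 3x3 determinant.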
 
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
        return btMatrix3x3(
                _mm_and_ps(m_el[0].mVec128, btvAbsfMask),
                _mm_and_ps(m_el[1].mVec128, btvAbsfMask),
                _mm_and_ps(m_el[2].mVec128, btvAbsfMask));
 
#elif defined(BT_USE_NEON)
        return btMatrix3x3(
                (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask),
                (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask),
                (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask));
 
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
        __m128 v0 = m_el[0].mVec128;
        __m128 v1 = m_el[1].mVec128;
        __m128 v2 = m_el[2].mVec128;
        __m128 vT;

        v2 = _mm_and_ps(v2, btvFFF0fMask);  // (x2, y2, z2, 0)

        vT = _mm_unpackhi_ps(v0, v1);  // (z0, z1, *, *)
        v0 = _mm_unpacklo_ps(v0, v1);  // (x0, x1, y0, y1)

        v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3));                    // (y0, y1, y2, 0)
        v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3));                    // (x0, x1, x2, 0)
        v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));  // (z0, z1, z2, 0)
 
#elif defined(BT_USE_NEON)
        // zMask clears the w lane of the last column vector
        static const uint32x2_t zMask = (const uint32x2_t){static_cast<uint32_t>(-1), 0};
        float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);               // {x0 x1 z0 z1}, {y0 y1 w0 w1}
        float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));  // {x2 0}, {y2 0}
        float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
        float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
        float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
        float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);  // (z0, z1, z2, 0)
 
        return btMatrix3x3(cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2),
                           cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0),
                           cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1));
 
        btVector3 co(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1));
 
                           co.y() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s,
                           co.z() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s);
 
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
        __m128 row = m_el[0].mVec128;
        __m128 m0 = _mm_and_ps(m.getRow(0).mVec128, btvFFF0fMask);
        __m128 m1 = _mm_and_ps(m.getRow(1).mVec128, btvFFF0fMask);
        __m128 m2 = _mm_and_ps(m.getRow(2).mVec128, btvFFF0fMask);
        __m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
        __m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
        __m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
        row = m_el[1].mVec128;
        r0 = _mm_add_ps(r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
        r1 = _mm_add_ps(r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
        r2 = _mm_add_ps(r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
        row = m_el[2].mVec128;
        r0 = _mm_add_ps(r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
        r1 = _mm_add_ps(r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
        r2 = _mm_add_ps(r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
 
#elif defined BT_USE_NEON
        static const uint32x4_t xyzMask = (const uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0};
        float32x4_t m0 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(0).mVec128, xyzMask);
        float32x4_t m1 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(1).mVec128, xyzMask);
        float32x4_t m2 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(2).mVec128, xyzMask);
        float32x4_t row = m_el[0].mVec128;
        float32x4_t r0 = vmulq_lane_f32(m0, vget_low_f32(row), 0);
        float32x4_t r1 = vmulq_lane_f32(m0, vget_low_f32(row), 1);
        float32x4_t r2 = vmulq_lane_f32(m0, vget_high_f32(row), 0);
        row = m_el[1].mVec128;
        r0 = vmlaq_lane_f32(r0, m1, vget_low_f32(row), 0);
        r1 = vmlaq_lane_f32(r1, m1, vget_low_f32(row), 1);
        r2 = vmlaq_lane_f32(r2, m1, vget_high_f32(row), 0);
        row = m_el[2].mVec128;
        r0 = vmlaq_lane_f32(r0, m2, vget_low_f32(row), 0);
        r1 = vmlaq_lane_f32(r1, m2, vget_low_f32(row), 1);
        r2 = vmlaq_lane_f32(r2, m2, vget_high_f32(row), 0);
 
        return btMatrix3x3(
                m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(),
                m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(),
                m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(),
                m_el[0].y() * m[0].x() + m_el[1].y() * m[1].x() + m_el[2].y() * m[2].x(),
                m_el[0].y() * m[0].y() + m_el[1].y() * m[1].y() + m_el[2].y() * m[2].y(),
                m_el[0].y() * m[0].z() + m_el[1].y() * m[1].z() + m_el[2].y() * m[2].z(),
                m_el[0].z() * m[0].x() + m_el[1].z() * m[1].x() + m_el[2].z() * m[2].x(),
                m_el[0].z() * m[0].y() + m_el[1].z() * m[1].y() + m_el[2].z() * m[2].y(),
                m_el[0].z() * m[0].z() + m_el[1].z() * m[1].z() + m_el[2].z() * m[2].z());
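        // transposeTimes(m) computes (*this)^T * m: entry (i, j) is the dot of
        // column i of *this with column j of m, which is why every term above
        // pairs the i component of m_el[row] with the j component of m[row].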
 
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
        __m128 a0 = m_el[0].mVec128;
        __m128 a1 = m_el[1].mVec128;
        __m128 a2 = m_el[2].mVec128;

        btMatrix3x3 mT = m.transpose();  // transpose() zeros the w lanes, so no masking is needed here

        __m128 mx = mT[0].mVec128;
        __m128 my = mT[1].mVec128;
        __m128 mz = mT[2].mVec128;

        __m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
        __m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
        __m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
        r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
        r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
        r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
        r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
        r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
        r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
 
#elif defined BT_USE_NEON
        float32x4_t a0 = m_el[0].mVec128;
        float32x4_t a1 = m_el[1].mVec128;
        float32x4_t a2 = m_el[2].mVec128;

        btMatrix3x3 mT = m.transpose();  // transpose() zeros the w lanes, so no masking is needed here

        float32x4_t mx = mT[0].mVec128;
        float32x4_t my = mT[1].mVec128;
        float32x4_t mz = mT[2].mVec128;

        float32x4_t r0 = vmulq_lane_f32(mx, vget_low_f32(a0), 0);
        float32x4_t r1 = vmulq_lane_f32(mx, vget_low_f32(a1), 0);
        float32x4_t r2 = vmulq_lane_f32(mx, vget_low_f32(a2), 0);
        r0 = vmlaq_lane_f32(r0, my, vget_low_f32(a0), 1);
        r1 = vmlaq_lane_f32(r1, my, vget_low_f32(a1), 1);
        r2 = vmlaq_lane_f32(r2, my, vget_low_f32(a2), 1);
        r0 = vmlaq_lane_f32(r0, mz, vget_high_f32(a0), 0);
        r1 = vmlaq_lane_f32(r1, mz, vget_high_f32(a1), 0);
        r2 = vmlaq_lane_f32(r2, mz, vget_high_f32(a2), 0);
 
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
        return v.dot3(m[0], m[1], m[2]);
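        // m * v: each component of the result is a row of m dotted with v; dot3()
        // evaluates all three row dots in one SIMD pass where available.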
 
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
        const __m128 vv = v.mVec128;

        __m128 c0 = bt_splat_ps(vv, 0);
        __m128 c1 = bt_splat_ps(vv, 1);
        __m128 c2 = bt_splat_ps(vv, 2);

        c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask));
        c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask));
        c0 = _mm_add_ps(c0, c1);
        c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask));
 
#elif defined(BT_USE_NEON)
        const float32x4_t vv = v.mVec128;
        const float32x2_t vlo = vget_low_f32(vv);
        const float32x2_t vhi = vget_high_f32(vv);

        float32x4_t c0, c1, c2;

        c0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
        c1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
        c2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);

        c0 = vmulq_lane_f32(c0, vlo, 0);
        c1 = vmulq_lane_f32(c1, vlo, 1);
        c2 = vmulq_lane_f32(c2, vhi, 0);
        c0 = vaddq_f32(c0, c1);
        c0 = vaddq_f32(c0, c2);
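        // v * m (i.e. m^T * v): v.x, v.y, v.z are splatted and used to weight the
        // rows of m, so the result is the linear combination x*row0 + y*row1 +
        // z*row2, which equals dotting v with each column.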
 
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
        __m128 m10 = m1[0].mVec128;
        __m128 m11 = m1[1].mVec128;
        __m128 m12 = m1[2].mVec128;

        __m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask);

        __m128 c0 = bt_splat_ps(m10, 0);
        __m128 c1 = bt_splat_ps(m11, 0);
        __m128 c2 = bt_splat_ps(m12, 0);

        c0 = _mm_mul_ps(c0, m2v);
        c1 = _mm_mul_ps(c1, m2v);
        c2 = _mm_mul_ps(c2, m2v);

        m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask);

        __m128 c0_1 = bt_splat_ps(m10, 1);
        __m128 c1_1 = bt_splat_ps(m11, 1);
        __m128 c2_1 = bt_splat_ps(m12, 1);

        c0_1 = _mm_mul_ps(c0_1, m2v);
        c1_1 = _mm_mul_ps(c1_1, m2v);
        c2_1 = _mm_mul_ps(c2_1, m2v);

        m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask);

        c0 = _mm_add_ps(c0, c0_1);
        c1 = _mm_add_ps(c1, c1_1);
        c2 = _mm_add_ps(c2, c2_1);

        m10 = bt_splat_ps(m10, 2);
        m11 = bt_splat_ps(m11, 2);
        m12 = bt_splat_ps(m12, 2);

        m10 = _mm_mul_ps(m10, m2v);
        m11 = _mm_mul_ps(m11, m2v);
        m12 = _mm_mul_ps(m12, m2v);

        c0 = _mm_add_ps(c0, m10);
        c1 = _mm_add_ps(c1, m11);
        c2 = _mm_add_ps(c2, m12);
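        // Same splat scheme as operator*=: element k of each of m1's rows
        // (splatted) weights row k of m2, and the three weighted rows sum to the
        // corresponding row of m1 * m2. c0/c1/c2 accumulate rows 0/1/2.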
 
#elif defined(BT_USE_NEON)
        float32x4_t rv0, rv1, rv2;
        float32x4_t v0, v1, v2;
        float32x4_t mv0, mv1, mv2;

        v0 = m1[0].mVec128;
        v1 = m1[1].mVec128;
        v2 = m1[2].mVec128;

        mv0 = (float32x4_t)vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask);
        mv1 = (float32x4_t)vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask);
        mv2 = (float32x4_t)vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask);

        rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
        rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
        rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

        rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
        rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
        rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

        rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
        rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
        rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
 
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
        __m128 c0, c1, c2;

        c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
        c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
        c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);

        c0 = _mm_and_ps(c0, c1);
        c0 = _mm_and_ps(c0, c2);

        int m = _mm_movemask_ps((__m128)c0);
        return (0x7 == (m & 0x7));
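        // _mm_movemask_ps packs the sign bit of each compare lane into an int;
        // requiring only the low three bits keeps the unused w lane from
        // affecting equality.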
 
        return (m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
                m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
                m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2]);
 
        for (int i = 0; i < 3; i++)

        for (int i = 0; i < 3; i++)

        for (int i = 0; i < 3; i++)

        for (int i = 0; i < 3; i++)

        for (int i = 0; i < 3; i++)
 
#endif  //BT_MATRIX3x3_H