#ifndef BT_SIMD__QUATERNION_H_
#define BT_SIMD__QUATERNION_H_

#ifdef BT_USE_DOUBLE_PRECISION
#define btQuaternionData btQuaternionDoubleData
#define btQuaternionDataName "btQuaternionDoubleData"
#else
#define btQuaternionData btQuaternionFloatData
#define btQuaternionDataName "btQuaternionFloatData"
#endif  //BT_USE_DOUBLE_PRECISION
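// btQuaternionData aliases whichever serialization struct matches the build's precision,
// and btQuaternionDataName carries the corresponding struct name for the serializer.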
#define vOnes (_mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f))

#if defined(BT_USE_SSE)

#define vQInv (_mm_set_ps(+0.0f, -0.0f, -0.0f, -0.0f))
#define vPPPM (_mm_set_ps(-0.0f, +0.0f, +0.0f, +0.0f))
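// _mm_set_ps takes lanes in (w, z, y, x) order, so vQInv holds negative zeros in the
// x, y and z lanes and vPPPM in the w lane. XORing with a negative zero flips only the
// sign bit, which is how the SIMD code below conjugates a quaternion (vQInv) or negates
// the w term of a product (vPPPM).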
#elif defined(BT_USE_NEON)

#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)

		mVec128 = rhs.mVec128;

#ifndef BT_EULER_DEFAULT_ZYX
		setValue(cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,
				 cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,
				 sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,
				 cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);

		setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,
				 cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,
				 cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,
				 cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);
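		// The two setValue calls above are the closed-form expansion of composing three
		// single-axis half-angle rotations (the setEuler and setEulerZYX bodies in Bullet's
		// btQuaternion): the first applies yaw about Y, pitch about X and roll about Z,
		// the second applies yaw about Z, pitch about Y and roll about X. The ZYX variant
		// is equivalent (up to rounding) to
		//   btQuaternion(btVector3(0, 0, 1), yaw)
		//     * btQuaternion(btVector3(0, 1, 0), pitch)
		//     * btQuaternion(btVector3(1, 0, 0), roll)
		// using btQuaternion's axis-angle constructor (not shown in this excerpt).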
 
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
		mVec128 = _mm_add_ps(mVec128, q.mVec128);
#elif defined(BT_USE_NEON)
		mVec128 = vaddq_f32(mVec128, q.mVec128);

#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
		mVec128 = _mm_sub_ps(mVec128, q.mVec128);
#elif defined(BT_USE_NEON)
		mVec128 = vsubq_f32(mVec128, q.mVec128);

#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
		__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
		vs = bt_pshufd_ps(vs, 0);     //	(S S S S)
		mVec128 = _mm_mul_ps(mVec128, vs);
#elif defined(BT_USE_NEON)
		mVec128 = vmulq_n_f32(mVec128, s);
 
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
		__m128 vQ2 = q.get128();

		__m128 A1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(0, 1, 2, 0));
		__m128 B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3, 3, 3, 0));

		__m128 A2 = bt_pshufd_ps(mVec128, BT_SHUFFLE(1, 2, 0, 1));
		__m128 B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2, 0, 1, 1));

		B1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(2, 0, 1, 2));
		B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1, 2, 0, 2));

		mVec128 = bt_splat_ps(mVec128, 3);  //	(w1 w1 w1 w1)
		mVec128 = mVec128 * vQ2;            //	(w1*x2 w1*y2 w1*z2 w1*w2)

		mVec128 = mVec128 - B1;
		A1 = _mm_xor_ps(A1, vPPPM);  //	flip the sign of the w lane
		mVec128 = mVec128 + A1;
 
#elif defined(BT_USE_NEON)
		float32x4_t vQ1 = mVec128;
		float32x4_t vQ2 = q.get128();
		float32x4_t A0, A1, B1, A2, B2, A3, B3;
		float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;

		{
			float32x2x2_t tmp;

			tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // {z1 x1}, {w1 y1}
			vQ1zx = tmp.val[0];

			tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // {z2 x2}, {w2 y2}
			vQ2zx = tmp.val[0];
		}
		vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);

		vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);

		vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
		vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);

		A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                     // (x1 y1 z1 x1)
		B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);  // (w2 w2 w2 x2)

		A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
		B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));

		A3 = vcombine_f32(vQ1zx, vQ1yz);
		B3 = vcombine_f32(vQ2yz, vQ2xz);

		A1 = vmulq_f32(A1, B1);
		A2 = vmulq_f32(A2, B2);
		A3 = vmulq_f32(A3, B3);
		A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);  // A0 = vQ2 * w1

		A1 = vaddq_f32(A1, A2);
		A0 = vsubq_f32(A0, A3);

		//	flip the sign of the w lane
		A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
		A0 = vaddq_f32(A0, A1);

		mVec128 = A0;
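		// Both SIMD branches above compute the Hamilton product the same way: four
		// lane-wise products of shuffled copies of the two operands are combined with
		// adds, one subtract, and a single sign flip of the w lane (the vPPPM XOR), so
		// no component is ever extracted to a scalar register.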
 
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
		__m128 vd = _mm_mul_ps(mVec128, q.mVec128);

		__m128 t = _mm_movehl_ps(vd, vd);
		vd = _mm_add_ps(vd, t);
		t = _mm_shuffle_ps(vd, vd, 0x55);
		vd = _mm_add_ss(vd, t);

		return _mm_cvtss_f32(vd);
#elif defined(BT_USE_NEON)
		float32x4_t vd = vmulq_f32(mVec128, q.mVec128);
		float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd));
		x = vpadd_f32(x, x);
		return vget_lane_f32(x, 0);
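		// Both branches reduce the four lane products with in-register horizontal adds
		// (movehl + shuffle on SSE, two vpadds on NEON) before extracting the scalar
		// dot product.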
 
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
		__m128 vd = _mm_mul_ps(mVec128, mVec128);

		__m128 t = _mm_movehl_ps(vd, vd);
		vd = _mm_add_ps(vd, t);
		t = _mm_shuffle_ps(vd, vd, 0x55);
		vd = _mm_add_ss(vd, t);

		vd = _mm_sqrt_ss(vd);
		vd = _mm_div_ss(vOnes, vd);
		vd = bt_pshufd_ps(vd, 0);  // splat the reciprocal length
		mVec128 = _mm_mul_ps(mVec128, vd);
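		// The block above squares the components, horizontally adds them, takes a scalar
		// sqrt and divides it into 1.0 (vOnes), then broadcasts the reciprocal length and
		// scales the quaternion by it. Using a true sqrt + divide rather than the
		// _mm_rsqrt_ss approximation keeps full single precision.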
 
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
		__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
		vs = bt_pshufd_ps(vs, 0x00);  //	(S S S S)

#elif defined(BT_USE_NEON)

#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)

#elif defined(BT_USE_NEON)
		return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)vQInv));  // flip the signs of x, y and z
 
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)

#elif defined(BT_USE_NEON)

#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)

#elif defined(BT_USE_NEON)

#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)

#elif defined(BT_USE_NEON)
		return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)btvMzeroMask));  // negate all four components
 
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	__m128 vQ1 = q1.get128();
	__m128 vQ2 = q2.get128();
	__m128 A0, A1, B1, A2, B2;

	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0, 1, 2, 0));  // (x1 y1 z1 x1)
	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3, 3, 3, 0));  // (w2 w2 w2 x2)

	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1, 2, 0, 1));  // (y1 z1 x1 y1)
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2, 0, 1, 1));  // (z2 x2 y2 y2)

	B1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2, 0, 1, 2));  // (z1 x1 y1 z1)
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1, 2, 0, 2));  // (y2 z2 x2 z2)

	A0 = bt_splat_ps(vQ1, 3);  // (w1 w1 w1 w1)

	A1 = _mm_xor_ps(A1, vPPPM);  //	flip the sign of the w lane
 
#elif defined(BT_USE_NEON)
	float32x4_t vQ1 = q1.get128();
	float32x4_t vQ2 = q2.get128();
	float32x4_t A0, A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;

	{
		float32x2x2_t tmp;

		tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // {z1 x1}, {w1 y1}
		vQ1zx = tmp.val[0];

		tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // {z2 x2}, {w2 y2}
		vQ2zx = tmp.val[0];
	}
	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);

	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);

	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);

	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                     // (x1 y1 z1 x1)
	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);  // (w2 w2 w2 x2)

	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));

	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);

	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);
	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);  // A0 = vQ2 * w1

	A1 = vaddq_f32(A1, A2);
	A0 = vsubq_f32(A0, A3);

	//	flip the sign of the w lane
	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
	A0 = vaddq_f32(A0, A1);

	return btQuaternion(A0);
 
#else
	return btQuaternion(
		q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y(),
		q1.w() * q2.y() + q1.y() * q2.w() + q1.z() * q2.x() - q1.x() * q2.z(),
		q1.w() * q2.z() + q1.z() * q2.w() + q1.x() * q2.y() - q1.y() * q2.x(),
		q1.w() * q2.w() - q1.x() * q2.x() - q1.y() * q2.y() - q1.z() * q2.z());
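	// Usage sketch (assumes btQuaternion's axis-angle constructor and the SIMD_HALF_PI
	// constant from btScalar.h, neither of which is shown in this excerpt): composing two
	// rotations is plain operator*, with the right-hand factor applied first.
	//   btQuaternion yaw90(btVector3(0, 0, 1), SIMD_HALF_PI);   // 90 degrees about Z
	//   btQuaternion roll90(btVector3(1, 0, 0), SIMD_HALF_PI);  // 90 degrees about X
	//   btQuaternion combined = yaw90 * roll90;                 // roll first, then yaw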
 
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	__m128 vQ1 = q.get128();
	__m128 vQ2 = w.get128();
	__m128 A1, B1, A2, B2, A3, B3;

	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(3, 3, 3, 0));
	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(0, 1, 2, 0));

	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1, 2, 0, 1));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2, 0, 1, 1));

	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2, 0, 1, 2));
	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1, 2, 0, 2));

	A1 = _mm_xor_ps(A1, vPPPM);  //	flip the sign of the w lane
 
#elif defined(BT_USE_NEON)
	float32x4_t vQ1 = q.get128();
	float32x4_t vQ2 = w.get128();
	float32x4_t A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz;

	vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1);
	{
		float32x2x2_t tmp;

		tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // {z x}, {w y}
		vQ2zx = tmp.val[0];

		tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // {z x}, {w y}
		vQ1zx = tmp.val[0];
	}
	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);

	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);

	A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx);
	B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx);

	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));

	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);

	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);

	A1 = vaddq_f32(A1, A2);

	//	flip the sign of the w lane
	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);

	A1 = vsubq_f32(A1, A3);

	return btQuaternion(A1);
 
#else
	return btQuaternion(
		q.w() * w.x() + q.y() * w.z() - q.z() * w.y(),
		q.w() * w.y() + q.z() * w.x() - q.x() * w.z(),
		q.w() * w.z() + q.x() * w.y() - q.y() * w.x(),
		-q.x() * w.x() - q.y() * w.y() - q.z() * w.z());
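	// This overload multiplies a quaternion by a btVector3 treated as a pure quaternion
	// (w = 0): the missing q.w() * w.w() terms above are simply zero. It is one half of
	// the q * v * q.inverse() rotation used further down in this header.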
 
#if defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	__m128 vQ1 = w.get128();
	__m128 vQ2 = q.get128();
	__m128 A1, B1, A2, B2, A3, B3;

	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0, 1, 2, 0));
	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3, 3, 3, 0));

	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1, 2, 0, 1));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2, 0, 1, 1));

	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2, 0, 1, 2));
	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1, 2, 0, 2));

	A1 = _mm_xor_ps(A1, vPPPM);  //	flip the sign of the w lane
 
#elif defined(BT_USE_NEON)
	float32x4_t vQ1 = w.get128();
	float32x4_t vQ2 = q.get128();
	float32x4_t A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;

	{
		float32x2x2_t tmp;

		tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // {z x}, {w y}
		vQ1zx = tmp.val[0];

		tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // {z x}, {w y}
		vQ2zx = tmp.val[0];
	}
	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);

	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);

	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);

	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);
	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);

	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));

	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);

	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);

	A1 = vaddq_f32(A1, A2);

	//	flip the sign of the w lane
	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);

	A1 = vsubq_f32(A1, A3);

	return btQuaternion(A1);
 
#else
	return btQuaternion(
		+w.x() * q.w() + w.y() * q.z() - w.z() * q.y(),
		+w.y() * q.w() + w.z() * q.x() - w.x() * q.z(),
		+w.z() * q.w() + w.x() * q.y() - w.y() * q.x(),
		-w.x() * q.x() - w.y() * q.y() - w.z() * q.z());
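	// Mirror image of the previous overload: the btVector3 is the left factor, again
	// treated as a pure quaternion with a zero w component.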
 
	return q1.slerp(q2, t);
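	// slerp interpolates with constant angular velocity along the shorter great-circle
	// arc between q1 and q2: t = 0 yields q1, t = 1 yields q2.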
 
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	return btVector3(_mm_and_ps(q.get128(), btvFFF0fMask));
#elif defined(BT_USE_NEON)
	return btVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), btvFFF0Mask));
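	// Rotating a vector by a quaternion reduces to q * v * q.inverse(); the AND masks
	// above just clear the w lane of that product so the xyz lanes can be returned as a
	// btVector3 without a scalar copy.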
 
	for (int i = 0; i < 4; i++)

	for (int i = 0; i < 4; i++)

	for (int i = 0; i < 4; i++)

	for (int i = 0; i < 4; i++)

	for (int i = 0; i < 4; i++)

	for (int i = 0; i < 4; i++)

	for (int i = 0; i < 4; i++)
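	// The loops above are the bodies of the serialize / deSerialize helpers (their
	// signatures and loop bodies are elided in this excerpt); each copies the four
	// components between m_floats[] and a btQuaternionFloatData or
	// btQuaternionDoubleData struct.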
 
#endif  //BT_SIMD__QUATERNION_H_