#include <xmmintrin.h>
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END __attribute__((aligned(16)))
# include <emmintrin.h>
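/* ALIGN16_BEG is the MSVC (__declspec) spelling and ALIGN16_END the gcc/icc
   (__attribute__) spelling of 16-byte alignment; in the full header they live in
   opposite branches of a compiler #ifdef, and <emmintrin.h> is only required for
   the SSE2 (USE_SSE2) code paths below. The macros that follow declare 16-byte
   aligned vectors of four identical constants. */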
#define _PS_CONST(Name, Val) \
static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
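/* Cephes-derived constants for the natural-log approximation: sqrt(1/2) and the
   minimax polynomial coefficients. */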
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
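/* Union and helpers used to copy a whole SSE register to/from a pair of MMX
   registers on the SSE1-only code paths. */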
typedef union xmm_mm_union { __m128 xmm; __m64 mm[2]; } xmm_mm_union;
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
  xmm_mm_union u; u.xmm = xmm_; \
  mm0_ = u.mm[0]; mm1_ = u.mm[1]; \
}
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
  xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}
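/* Natural logarithm of four packed floats, Cephes style. Flag non-positive
   inputs (they end up as NaN) and clamp away denormals. */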
v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);
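/* Extract the exponent: the MMX pair (mm0/mm1) version and the SSE2 (emm0)
   version are alternative code paths. */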
COPY_XMM_TO_MM(x, mm0, mm1);
mm0 = _mm_srli_pi32(mm0, 23);
mm1 = _mm_srli_pi32(mm1, 23);
emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
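/* Keep only the mantissa bits and force the value into [0.5, 1) by OR-ing in
   the exponent of 0.5. */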
x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
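/* Remove the exponent bias (127), convert to float (MMX pair path, then the
   SSE2 path), and add 1 to account for the [0.5, 1) mantissa. */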
mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
v4sf e = _mm_cvtepi32_ps(emm0);
e = _mm_add_ps(e, one);
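/* If the mantissa is below sqrt(1/2), decrement the exponent and double the
   mantissa, so the polynomial argument becomes x-1 or 2x-1 and stays close to
   zero. */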
v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
v4sf tmp = _mm_and_ps(x, mask);
x = _mm_sub_ps(x, one);
e = _mm_sub_ps(e, _mm_and_ps(one, mask));
x = _mm_add_ps(x, tmp);
v4sf z = _mm_mul_ps(x,x);
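/* Horner evaluation of the minimax polynomial in x (z = x*x). */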
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
y = _mm_mul_ps(y, x);
y = _mm_mul_ps(y, z);
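/* Add the exponent contribution e*ln(2), split across the q1/q2 constants for
   precision, and subtract the 0.5*z term of the log series. */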
tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
y = _mm_add_ps(y, tmp);
tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
x = _mm_add_ps(x, y);
x = _mm_add_ps(x, tmp);
x = _mm_or_ps(x, invalid_mask); /* lanes with a non-positive input become NaN */
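/* Constants for the packed exponential: exp(x) is computed as 2^n * exp(r),
   with n = round(x/ln 2) and r the reduced argument. */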
_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
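/* Clamp the argument to the representable range, then compute
   fx = x*log2(e) + 0.5 as the rounding candidate for n. */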
v4sf tmp = _mm_setzero_ps(), fx;
x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
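/* floor(fx): truncate to integer (MMX pair or SSE2 variant), convert back, and
   subtract 1 wherever truncation rounded up. */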
tmp = _mm_movehl_ps(tmp, fx);
mm0 = _mm_cvttps_pi32(fx);
mm1 = _mm_cvttps_pi32(tmp);
tmp = _mm_cvtpi32x2_ps(mm0, mm1);
emm0 = _mm_cvttps_epi32(fx);
tmp = _mm_cvtepi32_ps(emm0);
v4sf mask = _mm_cmpgt_ps(tmp, fx);
mask = _mm_and_ps(mask, one);
fx = _mm_sub_ps(tmp, mask);
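/* Cody-and-Waite style reduction: x -= fx*C1 + fx*C2, where C1 + C2
   approximates ln 2 with extra precision. */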
tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
x = _mm_sub_ps(x, tmp);
x = _mm_sub_ps(x, z);
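/* Polynomial approximation of exp on the reduced interval. */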
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, x);
y = _mm_add_ps(y, one);
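/* Build 2^n by adding the exponent bias and shifting into the exponent field
   (MMX pair path, then the SSE2 path), and scale the result. */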
z = _mm_movehl_ps(z, fx);
mm0 = _mm_cvttps_pi32(fx);
mm1 = _mm_cvttps_pi32(z);
mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
mm0 = _mm_slli_pi32(mm0, 23);
mm1 = _mm_slli_pi32(mm1, 23);
COPY_MM_TO_XMM(mm0, mm1, pow2n);
emm0 = _mm_cvttps_epi32(fx);
emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
emm0 = _mm_slli_epi32(emm0, 23);
v4sf pow2n = _mm_castsi128_ps(emm0);
y = _mm_mul_ps(y, pow2n);
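/* Shared sine/cosine constants: -Pi/4 split into three parts (DP1..DP3), the
   cosine polynomial coefficients, and 4/Pi. */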
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516);
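/* Packed sine: save the sign, take |x|, and scale by 4/Pi to obtain the octant
   index j. */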
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
v2si mm0, mm1, mm2, mm3;
sign_bit = x;                                            /* remember the sign of the input */
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);            /* take the absolute value */
sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);  /* keep only the sign bit */
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);              /* scale by 4/Pi */
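/* Quadrant logic, SSE2 flavour: j = (j+1) & ~1 (see the Cephes sources), then
   build the sign-swap flag from bit 2 of j and the polynomial-selection mask
   from bit 1. */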
emm2 = _mm_cvttps_epi32(y);
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
v4sf poly_mask = _mm_castsi128_ps(emm2);
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
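/* The same quadrant logic on the SSE1/MMX path, working on the two __m64
   halves of y. */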
xmm2 = _mm_movehl_ps(xmm2, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm2);
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
v4sf swap_sign_bit, poly_mask;
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
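/* Extended-precision modular arithmetic: x = ((x - y*DP1) - y*DP2) - y*DP3,
   i.e. subtract y*Pi/4 in three steps. */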
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
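/* First polynomial: the cosine approximation 1 - z/2 + z^2*P(z), used where
   the selection mask is clear. */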
y = *(v4sf*)_ps_coscof_p0;
v4sf z = _mm_mul_ps(x,x);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(v4sf*)_ps_1);
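/* Second polynomial: the sine approximation x + x*z*Q(z), used where the
   selection mask is set. */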
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
/* select the correct result from the two polynomials */
xmm3 = poly_mask;
y2 = _mm_and_ps(xmm3, y2);
y = _mm_andnot_ps(xmm3, y);
y = _mm_add_ps(y, y2);
/* apply the sign */
y = _mm_xor_ps(y, sign_bit);
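/* Packed cosine: same structure as the sine, but there is no input sign to
   save (cos is even) and the octant index is shifted by 2. */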
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
v2si mm0, mm1, mm2, mm3;
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
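/* Quadrant logic for the cosine (SSE2): j = (j+1) & ~1, then j -= 2; the
   output sign comes from the complement of bit 2 of j and the polynomial mask
   from bit 1. */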
emm2 = _mm_cvttps_epi32(y);
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
v4sf sign_bit = _mm_castsi128_ps(emm0);
v4sf poly_mask = _mm_castsi128_ps(emm2);
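/* The same cosine quadrant logic on the SSE1/MMX path. */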
xmm2 = _mm_movehl_ps(xmm2, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm2);
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
v4sf sign_bit, poly_mask;
COPY_MM_TO_XMM(mm0, mm1, sign_bit);
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
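/* From here on the cosine follows the sine exactly: extended-precision
   reduction, both polynomials, blend by the selection mask, then apply the
   sign. */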
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
y = *(v4sf*)_ps_coscof_p0;
v4sf z = _mm_mul_ps(x,x);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(v4sf*)_ps_1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
/* select the correct result from the two polynomials */
xmm3 = poly_mask;
y2 = _mm_and_ps(xmm3, y2);
y = _mm_andnot_ps(xmm3, y);
y = _mm_add_ps(y, y2);
/* apply the sign */
y = _mm_xor_ps(y, sign_bit);
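/* sincos: sine and cosine of the same argument in one pass; the range
   reduction and the two polynomials are computed once and shared, only the
   sign/selection bookkeeping is doubled. */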
v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
v4si emm0, emm2, emm4;
v2si mm0, mm1, mm2, mm3, mm4, mm5;
sign_bit_sin = x;                                                /* remember the sign of the input */
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);                    /* take the absolute value */
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);  /* keep only the sign bit */
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);                      /* scale by 4/Pi */
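/* Quadrant logic (SSE2): as in the sine, with j also kept aside for the
   cosine sign. */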
emm2 = _mm_cvttps_epi32(y);
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
emm4 = emm2;  /* keep j around for the cosine sign below */
emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
v4sf poly_mask = _mm_castsi128_ps(emm2);
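/* The same bookkeeping on the SSE1/MMX path. */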
xmm3 = _mm_movehl_ps(xmm3, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm3);
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
mm4 = mm2;  /* keep j around for the cosine sign below */
mm5 = mm3;
mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
v4sf swap_sign_bit_sin;
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
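/* Extended-precision range reduction, shared by both results. */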
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
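/* Sign flag for the cosine, built from j-2 (SSE2 path, then the MMX path). */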
emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
emm4 = _mm_slli_epi32(emm4, 29);
v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
mm4 = _mm_slli_pi32(mm4, 29);
mm5 = _mm_slli_pi32(mm5, 29);
COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
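/* Evaluate the cosine polynomial (y) and the sine polynomial (y2) once; both
   results reuse them. */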
v4sf z = _mm_mul_ps(x,x);
y = *(v4sf*)_ps_coscof_p0;
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(v4sf*)_ps_1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
/* select the correct result from the two polynomials */
xmm3 = poly_mask;
v4sf ysin2 = _mm_and_ps(xmm3, y2);
v4sf ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2, ysin2);
y = _mm_sub_ps(y, ysin1);
xmm1 = _mm_add_ps(ysin1, ysin2);
xmm2 = _mm_add_ps(y, y2);
/* apply the signs and store both results */
*s = _mm_xor_ps(xmm1, sign_bit_sin);
*c = _mm_xor_ps(xmm2, sign_bit_cos);
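/* A minimal usage sketch. This is an illustration only: it assumes v4sf is a
   typedef for __m128 and that the routines above are exposed as sincos_ps,
   log_ps and exp_ps, none of which is shown in this fragment; the
   SSE_MATHFUN_EXAMPLE guard is likewise hypothetical. */
#ifdef SSE_MATHFUN_EXAMPLE
#include <stdio.h>
int main(void) {
  float in[4] = { 0.1f, 0.5f, 1.0f, 2.0f };
  v4sf x = _mm_loadu_ps(in);
  v4sf s, c;
  sincos_ps(x, &s, &c);         /* sine and cosine of all four lanes at once */
  v4sf r = log_ps(exp_ps(x));   /* should give back approximately x */
  float sf[4], cf[4], rf[4];
  _mm_storeu_ps(sf, s);
  _mm_storeu_ps(cf, c);
  _mm_storeu_ps(rf, r);
  for (int i = 0; i < 4; ++i)
    printf("x=%g  sin=%g  cos=%g  log(exp(x))=%g\n", in[i], sf[i], cf[i], rf[i]);
  return 0;
}
#endif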