LILAC
inline_trig.h
1 /* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
2 
3  Inspired by Intel Approximate Math library, and based on the
4  corresponding algorithms of the cephes math library
5 
6  The default is to use the SSE1 version. If you define USE_SSE2, the
7  SSE2 intrinsics will be used in place of the MMX intrinsics. Do
8  not expect any significant performance improvement with SSE2.
9 */
10 
11 /* Copyright (C) 2007 Julien Pommier
12 
13  This software is provided 'as-is', without any express or implied
14  warranty. In no event will the authors be held liable for any damages
15  arising from the use of this software.
16 
17  Permission is granted to anyone to use this software for any purpose,
18  including commercial applications, and to alter it and redistribute it
19  freely, subject to the following restrictions:
20 
21  1. The origin of this software must not be misrepresented; you must not
22  claim that you wrote the original software. If you use this software
23  in a product, an acknowledgment in the product documentation would be
24  appreciated but is not required.
25  2. Altered source versions must be plainly marked as such, and must not be
26  misrepresented as being the original software.
27  3. This notice may not be removed or altered from any source distribution.
28 
29  (this is the zlib license)
30 */
31 
32 #include <xmmintrin.h>
33 #define USE_SSE2
34 /* yes I know, the top of this file is quite ugly */
35 
36 #ifdef _MSC_VER /* visual c++ */
37 # define ALIGN16_BEG __declspec(align(16))
38 # define ALIGN16_END
39 #else /* gcc or icc */
40 # define ALIGN16_BEG
41 # define ALIGN16_END __attribute__((aligned(16)))
42 #endif
43 
44 /* __m128 is ugly to write */
45 typedef __m128 v4sf; // vector of 4 float (sse1)
46 
47 #ifdef USE_SSE2
48 # include <emmintrin.h>
49 typedef __m128i v4si; // vector of 4 int (sse2)
50 #else
51 typedef __m64 v2si; // vector of 2 int (mmx)
52 #endif
53 
54 /* declare some SSE constants -- why can't I figure a better way to do that? */
55 #define _PS_CONST(Name, Val) \
56  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
57 #define _PI32_CONST(Name, Val) \
58  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
59 #define _PS_CONST_TYPE(Name, Type, Val) \
60  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
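/* Illustrative expansion sketch (assumes the gcc/icc branch of the ALIGN16
   macros above): _PS_CONST(0p5, 0.5f) produces roughly

     static const float _ps_0p5[4] __attribute__((aligned(16))) =
         { 0.5f, 0.5f, 0.5f, 0.5f };

   i.e. every constant is a 16-byte-aligned array of four identical values
   that can be reinterpreted as one SSE register, e.g. *(v4sf*)_ps_0p5. */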
61 
62 _PS_CONST(1 , 1.0f);
63 _PS_CONST(0p5, 0.5f);
64 /* the smallest non-denormalized float number */
65 _PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
66 _PS_CONST_TYPE(mant_mask, int, 0x7f800000);
67 _PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
68 
69 _PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
70 _PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
71 
72 _PI32_CONST(1, 1);
73 _PI32_CONST(inv1, ~1);
74 _PI32_CONST(2, 2);
75 _PI32_CONST(4, 4);
76 _PI32_CONST(0x7f, 0x7f);
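/* Note: 0x7f (= 127) is the IEEE-754 single-precision exponent bias. log_ps
   subtracts it after shifting the exponent field right by 23 bits, and exp_ps
   adds it back before shifting left by 23 to rebuild 2^n. */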
77 
78 _PS_CONST(cephes_SQRTHF, 0.707106781186547524);
79 _PS_CONST(cephes_log_p0, 7.0376836292E-2);
80 _PS_CONST(cephes_log_p1, - 1.1514610310E-1);
81 _PS_CONST(cephes_log_p2, 1.1676998740E-1);
82 _PS_CONST(cephes_log_p3, - 1.2420140846E-1);
83 _PS_CONST(cephes_log_p4, + 1.4249322787E-1);
84 _PS_CONST(cephes_log_p5, - 1.6668057665E-1);
85 _PS_CONST(cephes_log_p6, + 2.0000714765E-1);
86 _PS_CONST(cephes_log_p7, - 2.4999993993E-1);
87 _PS_CONST(cephes_log_p8, + 3.3333331174E-1);
88 _PS_CONST(cephes_log_q1, -2.12194440e-4);
89 _PS_CONST(cephes_log_q2, 0.693359375);
90 
91 #ifndef USE_SSE2
92 typedef union xmm_mm_union {
93  __m128 xmm;
94  __m64 mm[2];
95 } xmm_mm_union;
96 
97 #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
98  xmm_mm_union u; u.xmm = xmm_; \
99  mm0_ = u.mm[0]; \
100  mm1_ = u.mm[1]; \
101 }
102 
103 #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
104  xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
105  }
106 
107 #endif // USE_SSE2
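/* In the SSE1-only build this union bridges the 128-bit SSE register and two
   64-bit MMX registers: the float work stays in the xmm value while the
   integer shifts/adds/masks are done on the mm halves, and _mm_empty() must
   be issued afterwards because MMX state aliases the x87 register stack. */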
108 
109 /* natural logarithm computed for 4 simultaneous floats;
110  returns NaN for x <= 0
111 */
112 v4sf log_ps(v4sf x) {
113 #ifdef USE_SSE2
114  v4si emm0;
115 #else
116  v2si mm0, mm1;
117 #endif
118  v4sf one = *(v4sf*)_ps_1;
119 
120  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
121 
122  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */
123 
124 #ifndef USE_SSE2
125  /* part 1: x = frexpf(x, &e); */
126  COPY_XMM_TO_MM(x, mm0, mm1);
127  mm0 = _mm_srli_pi32(mm0, 23);
128  mm1 = _mm_srli_pi32(mm1, 23);
129 #else
130  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
131 #endif
132  /* keep only the fractional part */
133  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
134  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
135 
136 #ifndef USE_SSE2
137  /* now e=mm0:mm1 contain the actual base-2 exponent */
138  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
139  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
140  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
141  _mm_empty(); /* bye bye mmx */
142 #else
143  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
144  v4sf e = _mm_cvtepi32_ps(emm0);
145 #endif
146 
147  e = _mm_add_ps(e, one);
148 
149  /* part2:
150  if( x < SQRTHF ) {
151  e -= 1;
152  x = x + x - 1.0;
153  } else { x = x - 1.0; }
154  */
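 /* The branch above is realized without branching: 'mask' is all-ones in the
    lanes where x < SQRTHF, so 'tmp = x & mask' keeps a copy of x only there
    (giving x + x - 1.0 after the add below) and 'one & mask' subtracts 1 from
    e only in those lanes; the other lanes simply compute x - 1.0. */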
155  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
156  v4sf tmp = _mm_and_ps(x, mask);
157  x = _mm_sub_ps(x, one);
158  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
159  x = _mm_add_ps(x, tmp);
160 
161 
162  v4sf z = _mm_mul_ps(x,x);
163 
164  v4sf y = *(v4sf*)_ps_cephes_log_p0;
165  y = _mm_mul_ps(y, x);
166  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
167  y = _mm_mul_ps(y, x);
168  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
169  y = _mm_mul_ps(y, x);
170  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
171  y = _mm_mul_ps(y, x);
172  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
173  y = _mm_mul_ps(y, x);
174  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
175  y = _mm_mul_ps(y, x);
176  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
177  y = _mm_mul_ps(y, x);
178  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
179  y = _mm_mul_ps(y, x);
180  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
181  y = _mm_mul_ps(y, x);
182 
183  y = _mm_mul_ps(y, z);
184 
185 
186  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
187  y = _mm_add_ps(y, tmp);
188 
189 
190  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
191  y = _mm_sub_ps(y, tmp);
192 
193  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
194  x = _mm_add_ps(x, y);
195  x = _mm_add_ps(x, tmp);
196  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
197  return x;
198 }
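/* Illustrative usage sketch (assumes this header is included and the inputs
   are strictly positive): take the natural log of four floats in one call. */
static void example_log_ps(void) {
  ALIGN16_BEG float in[4] ALIGN16_END  = { 0.5f, 1.0f, 2.0f, 8.0f };
  ALIGN16_BEG float out[4] ALIGN16_END = { 0, 0, 0, 0 };
  v4sf v = _mm_load_ps(in);   /* aligned load of the 4 inputs */
  v4sf r = log_ps(v);         /* vectorized ln() */
  _mm_store_ps(out, r);       /* out ~= { -0.693f, 0.0f, 0.693f, 2.079f } */
}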
199 
200 _PS_CONST(exp_hi, 88.3762626647949f);
201 _PS_CONST(exp_lo, -88.3762626647949f);
202 
203 _PS_CONST(cephes_LOG2EF, 1.44269504088896341);
204 _PS_CONST(cephes_exp_C1, 0.693359375);
205 _PS_CONST(cephes_exp_C2, -2.12194440e-4);
206 
207 _PS_CONST(cephes_exp_p0, 1.9875691500E-4);
208 _PS_CONST(cephes_exp_p1, 1.3981999507E-3);
209 _PS_CONST(cephes_exp_p2, 8.3334519073E-3);
210 _PS_CONST(cephes_exp_p3, 4.1665795894E-2);
211 _PS_CONST(cephes_exp_p4, 1.6666665459E-1);
212 _PS_CONST(cephes_exp_p5, 5.0000001201E-1);
213 
214 v4sf exp_ps(v4sf x) {
215  v4sf tmp = _mm_setzero_ps(), fx;
216 #ifdef USE_SSE2
217  v4si emm0;
218 #else
219  v2si mm0, mm1;
220 #endif
221  v4sf one = *(v4sf*)_ps_1;
222 
223  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
224  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
225 
226  /* express exp(x) as exp(g + n*log(2)) */
227  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
228  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
229 
230  /* how to perform a floorf with SSE: just below */
231 #ifndef USE_SSE2
232  /* step 1 : cast to int */
233  tmp = _mm_movehl_ps(tmp, fx);
234  mm0 = _mm_cvttps_pi32(fx);
235  mm1 = _mm_cvttps_pi32(tmp);
236  /* step 2 : cast back to float */
237  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
238 #else
239  emm0 = _mm_cvttps_epi32(fx);
240  tmp = _mm_cvtepi32_ps(emm0);
241 #endif
242  /* if greater, subtract 1 */
243  v4sf mask = _mm_cmpgt_ps(tmp, fx);
244  mask = _mm_and_ps(mask, one);
245  fx = _mm_sub_ps(tmp, mask);
246 
247  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
248  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
249  x = _mm_sub_ps(x, tmp);
250  x = _mm_sub_ps(x, z);
251 
252  z = _mm_mul_ps(x,x);
253 
254  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
255  y = _mm_mul_ps(y, x);
256  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
257  y = _mm_mul_ps(y, x);
258  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
259  y = _mm_mul_ps(y, x);
260  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
261  y = _mm_mul_ps(y, x);
262  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
263  y = _mm_mul_ps(y, x);
264  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
265  y = _mm_mul_ps(y, z);
266  y = _mm_add_ps(y, x);
267  y = _mm_add_ps(y, one);
268 
269  /* build 2^n */
270 #ifndef USE_SSE2
271  z = _mm_movehl_ps(z, fx);
272  mm0 = _mm_cvttps_pi32(fx);
273  mm1 = _mm_cvttps_pi32(z);
274  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
275  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
276  mm0 = _mm_slli_pi32(mm0, 23);
277  mm1 = _mm_slli_pi32(mm1, 23);
278 
279  v4sf pow2n;
280  COPY_MM_TO_XMM(mm0, mm1, pow2n);
281  _mm_empty();
282 #else
283  emm0 = _mm_cvttps_epi32(fx);
284  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
285  emm0 = _mm_slli_epi32(emm0, 23);
286  v4sf pow2n = _mm_castsi128_ps(emm0);
287 #endif
288  y = _mm_mul_ps(y, pow2n);
289  return y;
290 }
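/* Illustrative combination sketch: exp_ps and log_ps compose into a vectorized
   power function for strictly positive x, since x^y = exp(y*ln(x)). The helper
   name is only for the example, it is not part of this header. */
static v4sf example_pow_ps(v4sf x, v4sf y) {
  return exp_ps(_mm_mul_ps(y, log_ps(x)));  /* lanes with x <= 0 become NaN via log_ps */
}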
291 
292 _PS_CONST(minus_cephes_DP1, -0.78515625);
293 _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
294 _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
295 _PS_CONST(sincof_p0, -1.9515295891E-4);
296 _PS_CONST(sincof_p1, 8.3321608736E-3);
297 _PS_CONST(sincof_p2, -1.6666654611E-1);
298 _PS_CONST(coscof_p0, 2.443315711809948E-005);
299 _PS_CONST(coscof_p1, -1.388731625493765E-003);
300 _PS_CONST(coscof_p2, 4.166664568298827E-002);
301 _PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
302 
303 
304 /* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
305  it also runs on old Athlon XPs and the Pentium III of your
306  grandmother.
307 
308  The code is an exact rewriting of the cephes sinf function.
309  Precision is excellent as long as x < 8192 (I did not bother to
310  take into account the special handling cephes has for greater values
311  -- it does not return garbage for arguments over 8192, though, but
312  the extra precision is missing).
313 
314  Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
315  surprising but correct result.
316 
317  Performance is also surprisingly good: 1.33 times faster than the
318  macOS vsinf SSE2 function, and 1.5 times faster than the
319  __vrs4_sinf of AMD's ACML (which is only available in 64-bit). Not
320  too bad for an SSE1 function with no special tuning!
321  However, the latter libraries probably have much better handling of NaN,
322  Inf, denormalized and other special arguments.
323 
324  On my Core Duo, the execution of this function takes approximately 95 cycles.
325 
326  From what I have observed in experiments with the Intel AMath lib, switching to an
327  SSE2 version would improve performance by only about 10%.
328 
329  Since it is based on SSE intrinsics, it has to be compiled at -O2 to
330  deliver full speed.
331 */
332 v4sf sin_ps(v4sf x) { // any x
333  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
334 
335 #ifdef USE_SSE2
336  v4si emm0, emm2;
337 #else
338  v2si mm0, mm1, mm2, mm3;
339 #endif
340  sign_bit = x;
341  /* take the absolute value */
342  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
343  /* extract the sign bit (upper one) */
344  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
345 
346  /* scale by 4/Pi */
347  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
348 
349 #ifdef USE_SSE2
350  /* store the integer part of y in mm0 */
351  emm2 = _mm_cvttps_epi32(y);
352  /* j=(j+1) & (~1) (see the cephes sources) */
353  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
354  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
355  y = _mm_cvtepi32_ps(emm2);
356 
357  /* get the swap sign flag */
358  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
359  emm0 = _mm_slli_epi32(emm0, 29);
360  /* get the polynom selection mask
361  there is one polynom for 0 <= x <= Pi/4
362  and another one for Pi/4<x<=Pi/2
363 
364  Both branches will be computed.
365  */
366  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
367  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
368 
369  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
370  v4sf poly_mask = _mm_castsi128_ps(emm2);
371  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
372 
373 #else
374  /* store the integer part of y in mm0:mm1 */
375  xmm2 = _mm_movehl_ps(xmm2, y);
376  mm2 = _mm_cvttps_pi32(y);
377  mm3 = _mm_cvttps_pi32(xmm2);
378  /* j=(j+1) & (~1) (see the cephes sources) */
379  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
380  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
381  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
382  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
383  y = _mm_cvtpi32x2_ps(mm2, mm3);
384  /* get the swap sign flag */
385  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
386  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
387  mm0 = _mm_slli_pi32(mm0, 29);
388  mm1 = _mm_slli_pi32(mm1, 29);
389  /* get the polynom selection mask */
390  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
391  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
392  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
393  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
394  v4sf swap_sign_bit, poly_mask;
395  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
396  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
397  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
398  _mm_empty(); /* good-bye mmx */
399 #endif
400 
401  /* The magic pass: "Extended precision modular arithmetic"
402  x = ((x - y * DP1) - y * DP2) - y * DP3; */
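 /* DP1 + DP2 + DP3 adds up to 0.785398163397... = Pi/4, split so that the
    leading products y*DP1 and y*DP2 are computed (nearly) exactly in single
    precision; subtracting the three products one after another preserves far
    more bits of the reduced argument than a single x - y*(Pi/4) would. */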
403  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
404  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
405  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
406  xmm1 = _mm_mul_ps(y, xmm1);
407  xmm2 = _mm_mul_ps(y, xmm2);
408  xmm3 = _mm_mul_ps(y, xmm3);
409  x = _mm_add_ps(x, xmm1);
410  x = _mm_add_ps(x, xmm2);
411  x = _mm_add_ps(x, xmm3);
412 
413  /* Evaluate the first polynom (0 <= x <= Pi/4) */
414  y = *(v4sf*)_ps_coscof_p0;
415  v4sf z = _mm_mul_ps(x,x);
416 
417  y = _mm_mul_ps(y, z);
418  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
419  y = _mm_mul_ps(y, z);
420  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
421  y = _mm_mul_ps(y, z);
422  y = _mm_mul_ps(y, z);
423  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
424  y = _mm_sub_ps(y, tmp);
425  y = _mm_add_ps(y, *(v4sf*)_ps_1);
426 
427  /* Evaluate the second polynom (Pi/4 <= x <= Pi/2) */
428 
429  v4sf y2 = *(v4sf*)_ps_sincof_p0;
430  y2 = _mm_mul_ps(y2, z);
431  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
432  y2 = _mm_mul_ps(y2, z);
433  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
434  y2 = _mm_mul_ps(y2, z);
435  y2 = _mm_mul_ps(y2, x);
436  y2 = _mm_add_ps(y2, x);
437 
438  /* select the correct result from the two polynoms */
439  xmm3 = poly_mask;
440  y2 = _mm_and_ps(xmm3, y2);
441  y = _mm_andnot_ps(xmm3, y);
442  y = _mm_add_ps(y,y2);
443  /* update the sign */
444  y = _mm_xor_ps(y, sign_bit);
445  return y;
446 }
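/* Illustrative usage sketch: fill a buffer with sines four values at a time
   (the scalar tail for n not divisible by 4 is left out for brevity). */
static void example_sin_buffer(const float *in, float *out, int n) {
  int i;
  for (i = 0; i + 4 <= n; i += 4) {
    v4sf v = _mm_loadu_ps(in + i);        /* unaligned load of 4 inputs */
    _mm_storeu_ps(out + i, sin_ps(v));    /* 4 sines per iteration */
  }
}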
447 
448 /* almost the same as sin_ps */
449 v4sf cos_ps(v4sf x) { // any x
450  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
451 #ifdef USE_SSE2
452  v4si emm0, emm2;
453 #else
454  v2si mm0, mm1, mm2, mm3;
455 #endif
456  /* take the absolute value */
457  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
458 
459  /* scale by 4/Pi */
460  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
461 
462 #ifdef USE_SSE2
463  /* store the integer part of y in mm0 */
464  emm2 = _mm_cvttps_epi32(y);
465  /* j=(j+1) & (~1) (see the cephes sources) */
466  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
467  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
468  y = _mm_cvtepi32_ps(emm2);
469 
470  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
471 
472  /* get the swap sign flag */
473  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
474  emm0 = _mm_slli_epi32(emm0, 29);
475  /* get the polynom selection mask */
476  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
477  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
478 
479  v4sf sign_bit = _mm_castsi128_ps(emm0);
480  v4sf poly_mask = _mm_castsi128_ps(emm2);
481 #else
482  /* store the integer part of y in mm0:mm1 */
483  xmm2 = _mm_movehl_ps(xmm2, y);
484  mm2 = _mm_cvttps_pi32(y);
485  mm3 = _mm_cvttps_pi32(xmm2);
486 
487  /* j=(j+1) & (~1) (see the cephes sources) */
488  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
489  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
490  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
491  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
492 
493  y = _mm_cvtpi32x2_ps(mm2, mm3);
494 
495 
496  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
497  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
498 
499  /* get the swap sign flag in mm0:mm1 and the
500  polynom selection mask in mm2:mm3 */
501 
502  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
503  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
504  mm0 = _mm_slli_pi32(mm0, 29);
505  mm1 = _mm_slli_pi32(mm1, 29);
506 
507  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
508  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
509 
510  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
511  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
512 
513  v4sf sign_bit, poly_mask;
514  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
515  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
516  _mm_empty(); /* good-bye mmx */
517 #endif
518  /* The magic pass: "Extended precision modular arithmetic"
519  x = ((x - y * DP1) - y * DP2) - y * DP3; */
520  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
521  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
522  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
523  xmm1 = _mm_mul_ps(y, xmm1);
524  xmm2 = _mm_mul_ps(y, xmm2);
525  xmm3 = _mm_mul_ps(y, xmm3);
526  x = _mm_add_ps(x, xmm1);
527  x = _mm_add_ps(x, xmm2);
528  x = _mm_add_ps(x, xmm3);
529 
530  /* Evaluate the first polynom (0 <= x <= Pi/4) */
531  y = *(v4sf*)_ps_coscof_p0;
532  v4sf z = _mm_mul_ps(x,x);
533 
534  y = _mm_mul_ps(y, z);
535  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
536  y = _mm_mul_ps(y, z);
537  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
538  y = _mm_mul_ps(y, z);
539  y = _mm_mul_ps(y, z);
540  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
541  y = _mm_sub_ps(y, tmp);
542  y = _mm_add_ps(y, *(v4sf*)_ps_1);
543 
544  /* Evaluate the second polynom (Pi/4 <= x <= Pi/2) */
545 
546  v4sf y2 = *(v4sf*)_ps_sincof_p0;
547  y2 = _mm_mul_ps(y2, z);
548  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
549  y2 = _mm_mul_ps(y2, z);
550  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
551  y2 = _mm_mul_ps(y2, z);
552  y2 = _mm_mul_ps(y2, x);
553  y2 = _mm_add_ps(y2, x);
554 
555  /* select the correct result from the two polynoms */
556  xmm3 = poly_mask;
557  y2 = _mm_and_ps(xmm3, y2);
558  y = _mm_andnot_ps(xmm3, y);
559  y = _mm_add_ps(y,y2);
560  /* update the sign */
561  y = _mm_xor_ps(y, sign_bit);
562 
563  return y;
564 }
565 
566 /* Since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them:
567  it is almost as fast, and gives you a free cosine with your sine. */
568 void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
569  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
570 #ifdef USE_SSE2
571  v4si emm0, emm2, emm4;
572 #else
573  v2si mm0, mm1, mm2, mm3, mm4, mm5;
574 #endif
575  sign_bit_sin = x;
576  /* take the absolute value */
577  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
578  /* extract the sign bit (upper one) */
579  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
580 
581  /* scale by 4/Pi */
582  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
583 
584 #ifdef USE_SSE2
585  /* store the integer part of y in emm2 */
586  emm2 = _mm_cvttps_epi32(y);
587 
588  /* j=(j+1) & (~1) (see the cephes sources) */
589  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
590  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
591  y = _mm_cvtepi32_ps(emm2);
592 
593  emm4 = emm2;
594 
595  /* get the swap sign flag for the sine */
596  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
597  emm0 = _mm_slli_epi32(emm0, 29);
598  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
599 
600  /* get the polynom selection mask for the sine*/
601  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
602  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
603  v4sf poly_mask = _mm_castsi128_ps(emm2);
604 #else
605  /* store the integer part of y in mm2:mm3 */
606  xmm3 = _mm_movehl_ps(xmm3, y);
607  mm2 = _mm_cvttps_pi32(y);
608  mm3 = _mm_cvttps_pi32(xmm3);
609 
610  /* j=(j+1) & (~1) (see the cephes sources) */
611  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
612  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
613  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
614  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
615 
616  y = _mm_cvtpi32x2_ps(mm2, mm3);
617 
618  mm4 = mm2;
619  mm5 = mm3;
620 
621  /* get the swap sign flag for the sine */
622  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
623  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
624  mm0 = _mm_slli_pi32(mm0, 29);
625  mm1 = _mm_slli_pi32(mm1, 29);
626  v4sf swap_sign_bit_sin;
627  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
628 
629  /* get the polynom selection mask for the sine */
630 
631  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
632  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
633  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
634  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
635  v4sf poly_mask;
636  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
637 #endif
638 
639  /* The magic pass: "Extended precision modular arithmetic"
640  x = ((x - y * DP1) - y * DP2) - y * DP3; */
641  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
642  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
643  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
644  xmm1 = _mm_mul_ps(y, xmm1);
645  xmm2 = _mm_mul_ps(y, xmm2);
646  xmm3 = _mm_mul_ps(y, xmm3);
647  x = _mm_add_ps(x, xmm1);
648  x = _mm_add_ps(x, xmm2);
649  x = _mm_add_ps(x, xmm3);
650 
651 #ifdef USE_SSE2
652  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
653  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
654  emm4 = _mm_slli_epi32(emm4, 29);
655  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
656 #else
657  /* get the sign flag for the cosine */
658  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
659  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
660  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
661  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
662  mm4 = _mm_slli_pi32(mm4, 29);
663  mm5 = _mm_slli_pi32(mm5, 29);
664  v4sf sign_bit_cos;
665  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
666  _mm_empty(); /* good-bye mmx */
667 #endif
668 
669  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
670 
671 
672  /* Evaluate the first polynom (0 <= x <= Pi/4) */
673  v4sf z = _mm_mul_ps(x,x);
674  y = *(v4sf*)_ps_coscof_p0;
675 
676  y = _mm_mul_ps(y, z);
677  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
678  y = _mm_mul_ps(y, z);
679  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
680  y = _mm_mul_ps(y, z);
681  y = _mm_mul_ps(y, z);
682  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
683  y = _mm_sub_ps(y, tmp);
684  y = _mm_add_ps(y, *(v4sf*)_ps_1);
685 
686  /* Evaluate the second polynom (Pi/4 <= x <= Pi/2) */
687 
688  v4sf y2 = *(v4sf*)_ps_sincof_p0;
689  y2 = _mm_mul_ps(y2, z);
690  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
691  y2 = _mm_mul_ps(y2, z);
692  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
693  y2 = _mm_mul_ps(y2, z);
694  y2 = _mm_mul_ps(y2, x);
695  y2 = _mm_add_ps(y2, x);
696 
697  /* select the correct result from the two polynoms */
698  xmm3 = poly_mask;
699  v4sf ysin2 = _mm_and_ps(xmm3, y2);
700  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
701  y2 = _mm_sub_ps(y2,ysin2);
702  y = _mm_sub_ps(y, ysin1);
703 
704  xmm1 = _mm_add_ps(ysin1,ysin2);
705  xmm2 = _mm_add_ps(y,y2);
706 
707  /* update the sign */
708  *s = _mm_xor_ps(xmm1, sign_bit_sin);
709  *c = _mm_xor_ps(xmm2, sign_bit_cos);
710 }
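/* Illustrative usage sketch: rotate four 2-D points by the same angle; the
   sine and cosine both come from a single sincos_ps call. The helper name and
   array layout are only for the example. */
static void example_rotate4(float angle, const float px[4], const float py[4],
                            float rx[4], float ry[4]) {
  v4sf s, c;
  v4sf x, y;
  sincos_ps(_mm_set1_ps(angle), &s, &c);  /* same angle in all four lanes */
  x = _mm_loadu_ps(px);
  y = _mm_loadu_ps(py);
  _mm_storeu_ps(rx, _mm_sub_ps(_mm_mul_ps(x, c), _mm_mul_ps(y, s)));  /* x*cos - y*sin */
  _mm_storeu_ps(ry, _mm_add_ps(_mm_mul_ps(x, s), _mm_mul_ps(y, c)));  /* x*sin + y*cos */
}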
711 
712