SseMathFuncs.h
/**********************************************************************

 Audacity: A Digital Audio Editor

 SseMathFuncs.h

 Stephen Moshier (wrote original C version, The Cephes Library)
 Julien Pommier (converted to use SSE)
 Andrew Hallendorff (adapted for Audacity)

**********************************************************************/


/* JKC: The trig functions use Taylor's series, on the range 0 to Pi/4,
 * computing both Sin and Cos, and using one or the other (in the
 * solo functions), or both in the sincos function, which is the most
 * useful one for us for FFTs.
 * The constants minus_cephes_DP1 to minus_cephes_DP3 are used in the
 * angle-reduction (modulo) function.
 * Four sincos evaluations are done at a time.
 * If we wanted to do just sin or just cos, we could speed things up
 * by queuing up the Sines and Cosines into batches of 4 separately. */
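
/* A minimal sketch of that batching idea (illustrative only; the buffer
   names and size below are assumptions, not part of this header):

      ALIGN16_BEG float angles[8] ALIGN16_END =
         { 0.0f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f };
      float sines[8], cosines[8];
      for (int i = 0; i < 8; i += 4) {
         v4sf s, c;
         sincos_ps(_mm_load_ps(angles + i), &s, &c);   // 4 sin/cos pairs per call
         _mm_storeu_ps(sines + i, s);
         _mm_storeu_ps(cosines + i, c);
      }
*/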


/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

Inspired by Intel Approximate Math library, and based on the
corresponding algorithms of the cephes math library

The default is to use the SSE1 version. If you define USE_SSE2, the
SSE2 intrinsics will be used in place of the MMX intrinsics. Do not
expect any significant performance improvement with SSE2.
*/
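
/* For example (a sketch, assuming the target CPU supports SSE2):

      #define USE_SSE2
      #include "SseMathFuncs.h"
*/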

/* Copyright (C) 2007 Julien Pommier

This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.

(this is the zlib license)
*/
#ifndef SSE_MATHFUN
#define SSE_MATHFUN

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;    // vector of 4 float (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si;   // vector of 4 int (sse2)
#else
typedef __m64 v2si;     // vector of 2 int (mmx)
#endif

/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
   static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
   static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
   static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
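
/* As a point of reference, _PS_CONST(1, 1.0f) below expands (roughly) to

      static const float _ps_1[4] __attribute__((aligned(16))) =
         { 1.0f, 1.0f, 1.0f, 1.0f };

   (gcc/icc spelling shown; MSVC uses __declspec(align(16)) instead), and the
   constants are then loaded into a register with e.g. *(v4sf*)_ps_1. */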

_PS_CONST(1  , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);

#ifndef USE_SSE2
typedef union xmm_mm_union {
   __m128 xmm;
   __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {   \
   xmm_mm_union u; u.xmm = xmm_;             \
   mm0_ = u.mm[0];                           \
   mm1_ = u.mm[1];                           \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                      \
   xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;    \
}

#endif // USE_SSE2

/* natural logarithm computed for 4 simultaneous float
   return NaN for x <= 0
*/
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
   v4si emm0;
#else
   v2si mm0, mm1;
#endif
   v4sf one = *(v4sf*)_ps_1;

   v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

   x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
   /* part 1: x = frexpf(x, &e); */
   COPY_XMM_TO_MM(x, mm0, mm1);
   mm0 = _mm_srli_pi32(mm0, 23);
   mm1 = _mm_srli_pi32(mm1, 23);
#else
   emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
   /* keep only the fractional part */
   x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
   x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
   /* now e=mm0:mm1 contain the real base-2 exponent */
   mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
   mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
   v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
   _mm_empty(); /* bye bye mmx */
#else
   emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
   v4sf e = _mm_cvtepi32_ps(emm0);
#endif

   e = _mm_add_ps(e, one);

   /* part2:
      if( x < SQRTHF ) {
        e -= 1;
        x = x + x - 1.0;
      } else { x = x - 1.0; }
   */
   v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
   v4sf tmp = _mm_and_ps(x, mask);
   x = _mm_sub_ps(x, one);
   e = _mm_sub_ps(e, _mm_and_ps(one, mask));
   x = _mm_add_ps(x, tmp);

   v4sf z = _mm_mul_ps(x,x);

   v4sf y = *(v4sf*)_ps_cephes_log_p0;
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
   y = _mm_mul_ps(y, x);

   y = _mm_mul_ps(y, z);

   tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
   y = _mm_add_ps(y, tmp);

   tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);

   tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
   x = _mm_add_ps(x, y);
   x = _mm_add_ps(x, tmp);
   x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
   return x;
}
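
/* Usage sketch for log_ps (illustrative; the array names are assumptions):

      ALIGN16_BEG float in[4] ALIGN16_END = { 0.5f, 1.0f, 2.0f, 10.0f };
      float out[4];
      _mm_storeu_ps(out, log_ps(_mm_load_ps(in)));
      // out now holds the natural logs: roughly -0.693, 0.0, 0.693, 2.303
*/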

_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);

v4sf exp_ps(v4sf x) {
   v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
   v4si emm0;
#else
   v2si mm0, mm1;
#endif
   v4sf one = *(v4sf*)_ps_1;

   x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
   x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

   /* express exp(x) as exp(g + n*log(2)) */
   fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
   fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

   /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
   /* step 1 : cast to int */
   tmp = _mm_movehl_ps(tmp, fx);
   mm0 = _mm_cvttps_pi32(fx);
   mm1 = _mm_cvttps_pi32(tmp);
   /* step 2 : cast back to float */
   tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
   emm0 = _mm_cvttps_epi32(fx);
   tmp = _mm_cvtepi32_ps(emm0);
#endif
   /* if greater, subtract 1 */
   v4sf mask = _mm_cmpgt_ps(tmp, fx);
   mask = _mm_and_ps(mask, one);
   fx = _mm_sub_ps(tmp, mask);

   tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
   v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
   x = _mm_sub_ps(x, tmp);
   x = _mm_sub_ps(x, z);

   z = _mm_mul_ps(x,x);

   v4sf y = *(v4sf*)_ps_cephes_exp_p0;
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, x);
   y = _mm_add_ps(y, one);

   /* build 2^n */
#ifndef USE_SSE2
   z = _mm_movehl_ps(z, fx);
   mm0 = _mm_cvttps_pi32(fx);
   mm1 = _mm_cvttps_pi32(z);
   mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
   mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
   mm0 = _mm_slli_pi32(mm0, 23);
   mm1 = _mm_slli_pi32(mm1, 23);

   v4sf pow2n;
   COPY_MM_TO_XMM(mm0, mm1, pow2n);
   _mm_empty();
#else
   emm0 = _mm_cvttps_epi32(fx);
   emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
   emm0 = _mm_slli_epi32(emm0, 23);
   v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
   y = _mm_mul_ps(y, pow2n);
   return y;
}
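
/* Usage sketch for exp_ps (illustrative; names are assumptions):

      ALIGN16_BEG float in[4] ALIGN16_END = { -1.0f, 0.0f, 1.0f, 2.0f };
      float out[4];
      _mm_storeu_ps(out, exp_ps(_mm_load_ps(in)));
      // out is roughly { 0.368, 1.0, 2.718, 7.389 }
*/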

_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1,  8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0,  2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2,  4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics, so
   it also runs on old Athlon XPs and your grandmother's Pentium III.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the
   MacOS vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of AMD's ACML (which is only available in 64-bit). Not
   too bad for an SSE1 function (with no special tuning)!
   However, the latter libraries probably have much better handling of NaN,
   Inf, denormalized and other special arguments.

   On my Core 1 Duo, the execution of this function takes approximately 95 cycles.

   From what I have observed in experiments with the Intel AMath lib, switching
   to an SSE2 version would improve the performance by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
v4sf sin_ps(v4sf x) { // any x
   v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
   v4si emm0, emm2;
#else
   v2si mm0, mm1, mm2, mm3;
#endif
   sign_bit = x;
   /* take the absolute value */
   x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
   /* extract the sign bit (upper one) */
   sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   /* get the swap sign flag */
   emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   /* get the polynomial selection mask:
      there is one polynomial for 0 <= x <= Pi/4
      and another one for Pi/4 < x <= Pi/2

      Both branches will be computed.
   */
   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

   v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
   v4sf poly_mask = _mm_castsi128_ps(emm2);
   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

#else
   /* store the integer part of y in mm2:mm3 */
   xmm2 = _mm_movehl_ps(xmm2, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm2);
   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
   y = _mm_cvtpi32x2_ps(mm2, mm3);
   /* get the swap sign flag */
   mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
   mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);
   /* get the polynomial selection mask */
   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
   v4sf swap_sign_bit, poly_mask;
   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
   _mm_empty(); /* good-bye mmx */
#endif

   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
   xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
   xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
   xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

   /* Evaluate the first polynomial (0 <= x <= Pi/4) */
   y = *(v4sf*)_ps_coscof_p0;
   v4sf z = _mm_mul_ps(x,x);

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(v4sf*)_ps_1);

   /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

   v4sf y2 = *(v4sf*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynomials */
   xmm3 = poly_mask;
   y2 = _mm_and_ps(xmm3, y2);
   y = _mm_andnot_ps(xmm3, y);
   y = _mm_add_ps(y, y2);
   /* update the sign */
   y = _mm_xor_ps(y, sign_bit);
   return y;
}
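
/* Quick sanity check for sin_ps (illustrative; values rounded):

      ALIGN16_BEG float a[4] ALIGN16_END =
         { 0.0f, 0.52359878f, 0.78539816f, 1.57079633f };  // 0, Pi/6, Pi/4, Pi/2
      float r[4];
      _mm_storeu_ps(r, sin_ps(_mm_load_ps(a)));
      // r is approximately { 0.0, 0.5, 0.7071, 1.0 }
*/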

/* almost the same as sin_ps */
v4sf cos_ps(v4sf x) { // any x
   v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
   v4si emm0, emm2;
#else
   v2si mm0, mm1, mm2, mm3;
#endif
   /* take the absolute value */
   x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

   /* get the swap sign flag */
   emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   /* get the polynomial selection mask */
   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

   v4sf sign_bit = _mm_castsi128_ps(emm0);
   v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm2:mm3 */
   xmm2 = _mm_movehl_ps(xmm2, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm2);

   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

   y = _mm_cvtpi32x2_ps(mm2, mm3);

   mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
   mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

   /* get the swap sign flag in mm0:mm1 and the
      polynomial selection mask in mm2:mm3 */

   mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
   mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);

   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

   v4sf sign_bit, poly_mask;
   COPY_MM_TO_XMM(mm0, mm1, sign_bit);
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
   _mm_empty(); /* good-bye mmx */
#endif
   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
   xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
   xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
   xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

   /* Evaluate the first polynomial (0 <= x <= Pi/4) */
   y = *(v4sf*)_ps_coscof_p0;
   v4sf z = _mm_mul_ps(x,x);

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(v4sf*)_ps_1);

   /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

   v4sf y2 = *(v4sf*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynomials */
   xmm3 = poly_mask;
   y2 = _mm_and_ps(xmm3, y2);
   y = _mm_andnot_ps(xmm3, y);
   y = _mm_add_ps(y, y2);
   /* update the sign */
   y = _mm_xor_ps(y, sign_bit);

   return y;
}

/* Since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them.
   It is almost as fast, and gives you a free cosine with your sine. */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
   v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
   v4si emm0, emm2, emm4;
#else
   v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
   sign_bit_sin = x;
   /* take the absolute value */
   x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
   /* extract the sign bit (upper one) */
   sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);

   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   emm4 = emm2;

   /* get the swap sign flag for the sine */
   emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

   /* get the polynomial selection mask for the sine */
   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm2:mm3 */
   xmm3 = _mm_movehl_ps(xmm3, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm3);

   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

   y = _mm_cvtpi32x2_ps(mm2, mm3);

   mm4 = mm2;
   mm5 = mm3;

   /* get the swap sign flag for the sine */
   mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
   mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);
   v4sf swap_sign_bit_sin;
   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

   /* get the polynomial selection mask for the sine */

   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
   v4sf poly_mask;
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
   xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
   xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
   xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
   emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
   emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
   emm4 = _mm_slli_epi32(emm4, 29);
   v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
   /* get the sign flag for the cosine */
   mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
   mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
   mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
   mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
   mm4 = _mm_slli_pi32(mm4, 29);
   mm5 = _mm_slli_pi32(mm5, 29);
   v4sf sign_bit_cos;
   COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
   _mm_empty(); /* good-bye mmx */
#endif

   sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

   /* Evaluate the first polynomial (0 <= x <= Pi/4) */
   v4sf z = _mm_mul_ps(x,x);
   y = *(v4sf*)_ps_coscof_p0;

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(v4sf*)_ps_1);

   /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

   v4sf y2 = *(v4sf*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynomials */
   xmm3 = poly_mask;
   v4sf ysin2 = _mm_and_ps(xmm3, y2);
   v4sf ysin1 = _mm_andnot_ps(xmm3, y);
   y2 = _mm_sub_ps(y2, ysin2);
   y = _mm_sub_ps(y, ysin1);

   xmm1 = _mm_add_ps(ysin1, ysin2);
   xmm2 = _mm_add_ps(y, y2);

   /* update the sign */
   *s = _mm_xor_ps(xmm1, sign_bit_sin);
   *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
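
/* Usage sketch for sincos_ps, e.g. to fill a small table of sine/cosine pairs
   (illustrative; the names and the angle values are assumptions):

      ALIGN16_BEG float theta[4] ALIGN16_END =
         { 0.0f, 0.78539816f, 1.57079633f, 2.35619449f };  // k * Pi/4
      v4sf s, c;
      sincos_ps(_mm_load_ps(theta), &s, &c);
      // s ~ { 0.0, 0.7071, 1.0, 0.7071 },  c ~ { 1.0, 0.7071, 0.0, -0.7071 }
*/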

#endif // SSE_MATHFUN