Audacity 3.2.0
Classes | Macros | Typedefs | Functions
SseMathFuncs.h File Reference

SSE maths functions (for FFTs) More...

#include <xmmintrin.h>
Include dependency graph for SseMathFuncs.h:

Go to the source code of this file.

Classes

union  xmm_mm_union
 

Macros

#define ALIGN16_BEG
 
#define ALIGN16_END   __attribute__((aligned(16)))
 
#define _PS_CONST(Name, Val)    static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
 
#define _PI32_CONST(Name, Val)    static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
 
#define _PS_CONST_TYPE(Name, Type, Val)    static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
 
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_)
 
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_)
 

Typedefs

typedef __m128 v4sf
 
typedef __m64 v2si
 
typedef union xmm_mm_union xmm_mm_union
 

Functions

 _PS_CONST (1, 1.0f)
 
 _PS_CONST (0p5, 0.5f)
 
 _PS_CONST_TYPE (min_norm_pos, int, 0x00800000)
 
 _PS_CONST_TYPE (mant_mask, int, 0x7f800000)
 
 _PS_CONST_TYPE (inv_mant_mask, int, ~0x7f800000)
 
 _PS_CONST_TYPE (sign_mask, int,(int) 0x80000000)
 
 _PS_CONST_TYPE (inv_sign_mask, int, ~0x80000000)
 
 _PI32_CONST (1, 1)
 
 _PI32_CONST (inv1, ~1)
 
 _PI32_CONST (2, 2)
 
 _PI32_CONST (4, 4)
 
 _PI32_CONST (0x7f, 0x7f)
 
 _PS_CONST (cephes_SQRTHF, 0.707106781186547524)
 
 _PS_CONST (cephes_log_p0, 7.0376836292E-2)
 
 _PS_CONST (cephes_log_p1, - 1.1514610310E-1)
 
 _PS_CONST (cephes_log_p2, 1.1676998740E-1)
 
 _PS_CONST (cephes_log_p3, - 1.2420140846E-1)
 
 _PS_CONST (cephes_log_p4,+1.4249322787E-1)
 
 _PS_CONST (cephes_log_p5, - 1.6668057665E-1)
 
 _PS_CONST (cephes_log_p6,+2.0000714765E-1)
 
 _PS_CONST (cephes_log_p7, - 2.4999993993E-1)
 
 _PS_CONST (cephes_log_p8,+3.3333331174E-1)
 
 _PS_CONST (cephes_log_q1, -2.12194440e-4)
 
 _PS_CONST (cephes_log_q2, 0.693359375)
 
v4sf log_ps (v4sf x)
 
 _PS_CONST (exp_hi, 88.3762626647949f)
 
 _PS_CONST (exp_lo, -88.3762626647949f)
 
 _PS_CONST (cephes_LOG2EF, 1.44269504088896341)
 
 _PS_CONST (cephes_exp_C1, 0.693359375)
 
 _PS_CONST (cephes_exp_C2, -2.12194440e-4)
 
 _PS_CONST (cephes_exp_p0, 1.9875691500E-4)
 
 _PS_CONST (cephes_exp_p1, 1.3981999507E-3)
 
 _PS_CONST (cephes_exp_p2, 8.3334519073E-3)
 
 _PS_CONST (cephes_exp_p3, 4.1665795894E-2)
 
 _PS_CONST (cephes_exp_p4, 1.6666665459E-1)
 
 _PS_CONST (cephes_exp_p5, 5.0000001201E-1)
 
v4sf exp_ps (v4sf x)
 
 _PS_CONST (minus_cephes_DP1, -0.78515625)
 
 _PS_CONST (minus_cephes_DP2, -2.4187564849853515625e-4)
 
 _PS_CONST (minus_cephes_DP3, -3.77489497744594108e-8)
 
 _PS_CONST (sincof_p0, -1.9515295891E-4)
 
 _PS_CONST (sincof_p1, 8.3321608736E-3)
 
 _PS_CONST (sincof_p2, -1.6666654611E-1)
 
 _PS_CONST (coscof_p0, 2.443315711809948E-005)
 
 _PS_CONST (coscof_p1, -1.388731625493765E-003)
 
 _PS_CONST (coscof_p2, 4.166664568298827E-002)
 
 _PS_CONST (cephes_FOPI, 1.27323954473516)
 
v4sf sin_ps (v4sf x)
 
v4sf cos_ps (v4sf x)
 
void sincos_ps (v4sf x, v4sf *s, v4sf *c)
 

Detailed Description

SSE maths functions (for FFTs)

Definition in file SseMathFuncs.h.

Macro Definition Documentation

◆ _PI32_CONST

#define _PI32_CONST (   Name,
  Val 
)     static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

Definition at line 120 of file SseMathFuncs.h.

◆ _PS_CONST

#define _PS_CONST (   Name,
  Val 
)     static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

Definition at line 118 of file SseMathFuncs.h.

◆ _PS_CONST_TYPE

#define _PS_CONST_TYPE (   Name,
  Type,
  Val 
)     static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

Definition at line 122 of file SseMathFuncs.h.

◆ ALIGN16_BEG

#define ALIGN16_BEG

Definition at line 103 of file SseMathFuncs.h.

◆ ALIGN16_END

#define ALIGN16_END   __attribute__((aligned(16)))

Definition at line 104 of file SseMathFuncs.h.

◆ COPY_MM_TO_XMM

#define COPY_MM_TO_XMM (   mm0_,
  mm1_,
  xmm_ 
)
Value:
{ \
xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}

Definition at line 166 of file SseMathFuncs.h.

◆ COPY_XMM_TO_MM

#define COPY_XMM_TO_MM (   xmm_,
  mm0_,
  mm1_ 
)
Value:
{ \
xmm_mm_union u; u.xmm = xmm_; \
mm0_ = u.mm[0]; \
mm1_ = u.mm[1]; \
}

Definition at line 160 of file SseMathFuncs.h.

Typedef Documentation

◆ v2si

typedef __m64 v2si

Definition at line 114 of file SseMathFuncs.h.

◆ v4sf

typedef __m128 v4sf

Definition at line 108 of file SseMathFuncs.h.

◆ xmm_mm_union

typedef union xmm_mm_union xmm_mm_union

Function Documentation

◆ _PI32_CONST() [1/5]

_PI32_CONST ( 0x7f  ,
0x7f   
)

◆ _PI32_CONST() [2/5]

_PI32_CONST ( ,
 
)

◆ _PI32_CONST() [3/5]

_PI32_CONST ( ,
 
)

◆ _PI32_CONST() [4/5]

_PI32_CONST ( ,
 
)

◆ _PI32_CONST() [5/5]

_PI32_CONST ( inv1  ,
1 
)

◆ _PS_CONST() [1/35]

_PS_CONST ( 0p5  ,
0.  5f 
)

◆ _PS_CONST() [2/35]

_PS_CONST ( ,
1.  0f 
)

◆ _PS_CONST() [3/35]

_PS_CONST ( cephes_exp_C1  ,
0.  693359375 
)

◆ _PS_CONST() [4/35]

_PS_CONST ( cephes_exp_C2  ,
-2.12194440e-  4 
)

◆ _PS_CONST() [5/35]

_PS_CONST ( cephes_exp_p0  ,
1.9875691500E-  4 
)

◆ _PS_CONST() [6/35]

_PS_CONST ( cephes_exp_p1  ,
1.3981999507E-  3 
)

◆ _PS_CONST() [7/35]

_PS_CONST ( cephes_exp_p2  ,
8.3334519073E-  3 
)

◆ _PS_CONST() [8/35]

_PS_CONST ( cephes_exp_p3  ,
4.1665795894E-  2 
)

◆ _PS_CONST() [9/35]

_PS_CONST ( cephes_exp_p4  ,
1.6666665459E-  1 
)

◆ _PS_CONST() [10/35]

_PS_CONST ( cephes_exp_p5  ,
5.0000001201E-  1 
)

◆ _PS_CONST() [11/35]

_PS_CONST ( cephes_FOPI  ,
1.  27323954473516 
)

◆ _PS_CONST() [12/35]

_PS_CONST ( cephes_LOG2EF  ,
1.  44269504088896341 
)

◆ _PS_CONST() [13/35]

_PS_CONST ( cephes_log_p0  ,
7.0376836292E-  2 
)

◆ _PS_CONST() [14/35]

_PS_CONST ( cephes_log_p1  ,
- 1.1514610310E-  1 
)

◆ _PS_CONST() [15/35]

_PS_CONST ( cephes_log_p2  ,
1.1676998740E-  1 
)

◆ _PS_CONST() [16/35]

_PS_CONST ( cephes_log_p3  ,
- 1.2420140846E-  1 
)

◆ _PS_CONST() [17/35]

_PS_CONST ( cephes_log_p4  ,
+1.4249322787E-  1 
)

◆ _PS_CONST() [18/35]

_PS_CONST ( cephes_log_p5  ,
- 1.6668057665E-  1 
)

◆ _PS_CONST() [19/35]

_PS_CONST ( cephes_log_p6  ,
+2.0000714765E-  1 
)

◆ _PS_CONST() [20/35]

_PS_CONST ( cephes_log_p7  ,
- 2.4999993993E-  1 
)

◆ _PS_CONST() [21/35]

_PS_CONST ( cephes_log_p8  ,
+3.3333331174E-  1 
)

◆ _PS_CONST() [22/35]

_PS_CONST ( cephes_log_q1  ,
-2.12194440e-  4 
)

◆ _PS_CONST() [23/35]

_PS_CONST ( cephes_log_q2  ,
0.  693359375 
)

◆ _PS_CONST() [24/35]

_PS_CONST ( cephes_SQRTHF  ,
0.  707106781186547524 
)

◆ _PS_CONST() [25/35]

_PS_CONST ( coscof_p0  ,
2.443315711809948E-  005 
)

◆ _PS_CONST() [26/35]

_PS_CONST ( coscof_p1  ,
-1.388731625493765E-  003 
)

◆ _PS_CONST() [27/35]

_PS_CONST ( coscof_p2  ,
4.166664568298827E-  002 
)

◆ _PS_CONST() [28/35]

_PS_CONST ( exp_hi  ,
88.  3762626647949f 
)

◆ _PS_CONST() [29/35]

_PS_CONST ( exp_lo  ,
-88.  3762626647949f 
)

◆ _PS_CONST() [30/35]

_PS_CONST ( minus_cephes_DP1  ,
-0.  78515625 
)

◆ _PS_CONST() [31/35]

_PS_CONST ( minus_cephes_DP2  ,
-2.4187564849853515625e-  4 
)

◆ _PS_CONST() [32/35]

_PS_CONST ( minus_cephes_DP3  ,
-3.77489497744594108e-  8 
)

◆ _PS_CONST() [33/35]

_PS_CONST ( sincof_p0  ,
-1.9515295891E-  4 
)

◆ _PS_CONST() [34/35]

_PS_CONST ( sincof_p1  ,
8.3321608736E-  3 
)

◆ _PS_CONST() [35/35]

_PS_CONST ( sincof_p2  ,
-1.6666654611E-  1 
)

◆ _PS_CONST_TYPE() [1/5]

_PS_CONST_TYPE ( inv_mant_mask  ,
int  ,
0x7f800000 
)

◆ _PS_CONST_TYPE() [2/5]

_PS_CONST_TYPE ( inv_sign_mask  ,
int  ,
0x80000000 
)

◆ _PS_CONST_TYPE() [3/5]

_PS_CONST_TYPE ( mant_mask  ,
int  ,
0x7f800000   
)

◆ _PS_CONST_TYPE() [4/5]

_PS_CONST_TYPE ( min_norm_pos  ,
int  ,
0x00800000   
)

◆ _PS_CONST_TYPE() [5/5]

_PS_CONST_TYPE ( sign_mask  ,
int  ,
(int)  0x80000000 
)

◆ cos_ps()

v4sf cos_ps ( v4sf  x)

Definition at line 512 of file SseMathFuncs.h.

512 { // any x
513 v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
514#ifdef USE_SSE2
515 v4si emm0, emm2;
516#else
517 v2si mm0, mm1, mm2, mm3;
518#endif
519 /* take the absolute value */
520 x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
521
522 /* scale by 4/Pi */
523 y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
524
525#ifdef USE_SSE2
526 /* store the integer part of y in mm0 */
527 emm2 = _mm_cvttps_epi32(y);
528 /* j=(j+1) & (~1) (see the cephes sources) */
529 emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
530 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
531 y = _mm_cvtepi32_ps(emm2);
532
533 emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
534
535 /* get the swap sign flag */
536 emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
537 emm0 = _mm_slli_epi32(emm0, 29);
538 /* get the polynom selection mask */
539 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
540 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
541
542 v4sf sign_bit = _mm_castsi128_ps(emm0);
543 v4sf poly_mask = _mm_castsi128_ps(emm2);
544#else
545 /* store the integer part of y in mm0:mm1 */
546 xmm2 = _mm_movehl_ps(xmm2, y);
547 mm2 = _mm_cvttps_pi32(y);
548 mm3 = _mm_cvttps_pi32(xmm2);
549
550 /* j=(j+1) & (~1) (see the cephes sources) */
551 mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
552 mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
553 mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
554 mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
555
556 y = _mm_cvtpi32x2_ps(mm2, mm3);
557
558
559 mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
560 mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
561
562 /* get the swap sign flag in mm0:mm1 and the
563 polynom selection mask in mm2:mm3 */
564
565 mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
566 mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
567 mm0 = _mm_slli_pi32(mm0, 29);
568 mm1 = _mm_slli_pi32(mm1, 29);
569
570 mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
571 mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
572
573 mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
574 mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
575
576 v4sf sign_bit, poly_mask;
577 COPY_MM_TO_XMM(mm0, mm1, sign_bit);
578 COPY_MM_TO_XMM(mm2, mm3, poly_mask);
579 _mm_empty(); /* good-bye mmx */
580#endif
581 /* The magic pass: "Extended precision modular arithmetic"
582 x = ((x - y * DP1) - y * DP2) - y * DP3; */
583 xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
584 xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
585 xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
586 xmm1 = _mm_mul_ps(y, xmm1);
587 xmm2 = _mm_mul_ps(y, xmm2);
588 xmm3 = _mm_mul_ps(y, xmm3);
589 x = _mm_add_ps(x, xmm1);
590 x = _mm_add_ps(x, xmm2);
591 x = _mm_add_ps(x, xmm3);
592
593 /* Evaluate the first polynom (0 <= x <= Pi/4) */
594 y = *(v4sf*)_ps_coscof_p0;
595 v4sf z = _mm_mul_ps(x,x);
596
597 y = _mm_mul_ps(y, z);
598 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
599 y = _mm_mul_ps(y, z);
600 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
601 y = _mm_mul_ps(y, z);
602 y = _mm_mul_ps(y, z);
603 v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
604 y = _mm_sub_ps(y, tmp);
605 y = _mm_add_ps(y, *(v4sf*)_ps_1);
606
607 /* Evaluate the second polynom (Pi/4 <= x <= 0) */
608
609 v4sf y2 = *(v4sf*)_ps_sincof_p0;
610 y2 = _mm_mul_ps(y2, z);
611 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
612 y2 = _mm_mul_ps(y2, z);
613 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
614 y2 = _mm_mul_ps(y2, z);
615 y2 = _mm_mul_ps(y2, x);
616 y2 = _mm_add_ps(y2, x);
617
618 /* select the correct result from the two polynoms */
619 xmm3 = poly_mask;
620 y2 = _mm_and_ps(xmm3, y2); //, xmm3);
621 y = _mm_andnot_ps(xmm3, y);
622 y = _mm_add_ps(y,y2);
623 /* update the sign */
624 y = _mm_xor_ps(y, sign_bit);
625
626 return y;
627}
__m64 v2si
Definition: SseMathFuncs.h:114
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_)
Definition: SseMathFuncs.h:166
__m128 v4sf
Definition: SseMathFuncs.h:108

References COPY_MM_TO_XMM.

◆ exp_ps()

v4sf exp_ps ( v4sf  x)

Definition at line 277 of file SseMathFuncs.h.

277 {
278 v4sf tmp = _mm_setzero_ps(), fx;
279#ifdef USE_SSE2
280 v4si emm0;
281#else
282 v2si mm0, mm1;
283#endif
284 v4sf one = *(v4sf*)_ps_1;
285
286 x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
287 x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
288
289 /* express exp(x) as exp(g + n*log(2)) */
290 fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
291 fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
292
293 /* how to perform a floorf with SSE: just below */
294#ifndef USE_SSE2
295 /* step 1 : cast to int */
296 tmp = _mm_movehl_ps(tmp, fx);
297 mm0 = _mm_cvttps_pi32(fx);
298 mm1 = _mm_cvttps_pi32(tmp);
299 /* step 2 : cast back to float */
300 tmp = _mm_cvtpi32x2_ps(mm0, mm1);
301#else
302 emm0 = _mm_cvttps_epi32(fx);
303 tmp = _mm_cvtepi32_ps(emm0);
304#endif
305 /* if greater, subtract 1 */
306 v4sf mask = _mm_cmpgt_ps(tmp, fx);
307 mask = _mm_and_ps(mask, one);
308 fx = _mm_sub_ps(tmp, mask);
309
310 tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
311 v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
312 x = _mm_sub_ps(x, tmp);
313 x = _mm_sub_ps(x, z);
314
315 z = _mm_mul_ps(x,x);
316
317 v4sf y = *(v4sf*)_ps_cephes_exp_p0;
318 y = _mm_mul_ps(y, x);
319 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
320 y = _mm_mul_ps(y, x);
321 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
322 y = _mm_mul_ps(y, x);
323 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
324 y = _mm_mul_ps(y, x);
325 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
326 y = _mm_mul_ps(y, x);
327 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
328 y = _mm_mul_ps(y, z);
329 y = _mm_add_ps(y, x);
330 y = _mm_add_ps(y, one);
331
332 /* build 2^n */
333#ifndef USE_SSE2
334 z = _mm_movehl_ps(z, fx);
335 mm0 = _mm_cvttps_pi32(fx);
336 mm1 = _mm_cvttps_pi32(z);
337 mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
338 mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
339 mm0 = _mm_slli_pi32(mm0, 23);
340 mm1 = _mm_slli_pi32(mm1, 23);
341
342 v4sf pow2n;
343 COPY_MM_TO_XMM(mm0, mm1, pow2n);
344 _mm_empty();
345#else
346 emm0 = _mm_cvttps_epi32(fx);
347 emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
348 emm0 = _mm_slli_epi32(emm0, 23);
349 v4sf pow2n = _mm_castsi128_ps(emm0);
350#endif
351 y = _mm_mul_ps(y, pow2n);
352 return y;
353}

References COPY_MM_TO_XMM.

◆ log_ps()

v4sf log_ps ( v4sf  x)

Definition at line 175 of file SseMathFuncs.h.

175 {
176#ifdef USE_SSE2
177 v4si emm0;
178#else
179 v2si mm0, mm1;
180#endif
181 v4sf one = *(v4sf*)_ps_1;
182
183 v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
184
185 x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */
186
187#ifndef USE_SSE2
188 /* part 1: x = frexpf(x, &e); */
189 COPY_XMM_TO_MM(x, mm0, mm1);
190 mm0 = _mm_srli_pi32(mm0, 23);
191 mm1 = _mm_srli_pi32(mm1, 23);
192#else
193 emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
194#endif
195 /* keep only the fractional part */
196 x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
197 x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
198
199#ifndef USE_SSE2
200 /* now e=mm0:mm1 contain the really base-2 exponent */
201 mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
202 mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
203 v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
204 _mm_empty(); /* bye bye mmx */
205#else
206 emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
207 v4sf e = _mm_cvtepi32_ps(emm0);
208#endif
209
210 e = _mm_add_ps(e, one);
211
212 /* part2:
213 if( x < SQRTHF ) {
214 e -= 1;
215 x = x + x - 1.0;
216 } else { x = x - 1.0; }
217 */
218 v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
219 v4sf tmp = _mm_and_ps(x, mask);
220 x = _mm_sub_ps(x, one);
221 e = _mm_sub_ps(e, _mm_and_ps(one, mask));
222 x = _mm_add_ps(x, tmp);
223
224
225 v4sf z = _mm_mul_ps(x,x);
226
227 v4sf y = *(v4sf*)_ps_cephes_log_p0;
228 y = _mm_mul_ps(y, x);
229 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
230 y = _mm_mul_ps(y, x);
231 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
232 y = _mm_mul_ps(y, x);
233 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
234 y = _mm_mul_ps(y, x);
235 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
236 y = _mm_mul_ps(y, x);
237 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
238 y = _mm_mul_ps(y, x);
239 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
240 y = _mm_mul_ps(y, x);
241 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
242 y = _mm_mul_ps(y, x);
243 y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
244 y = _mm_mul_ps(y, x);
245
246 y = _mm_mul_ps(y, z);
247
248
249 tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
250 y = _mm_add_ps(y, tmp);
251
252
253 tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
254 y = _mm_sub_ps(y, tmp);
255
256 tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
257 x = _mm_add_ps(x, y);
258 x = _mm_add_ps(x, tmp);
259 x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
260 return x;
261}
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_)
Definition: SseMathFuncs.h:160

References COPY_XMM_TO_MM.

◆ sin_ps()

v4sf sin_ps ( v4sf  x)

Definition at line 395 of file SseMathFuncs.h.

395 { // any x
396 v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
397
398#ifdef USE_SSE2
399 v4si emm0, emm2;
400#else
401 v2si mm0, mm1, mm2, mm3;
402#endif
403 sign_bit = x;
404 /* take the absolute value */
405 x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
406 /* extract the sign bit (upper one) */
407 sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
408
409 /* scale by 4/Pi */
410 y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
411
412#ifdef USE_SSE2
413 /* store the integer part of y in mm0 */
414 emm2 = _mm_cvttps_epi32(y);
415 /* j=(j+1) & (~1) (see the cephes sources) */
416 emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
417 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
418 y = _mm_cvtepi32_ps(emm2);
419
420 /* get the swap sign flag */
421 emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
422 emm0 = _mm_slli_epi32(emm0, 29);
423 /* get the polynom selection mask
424 there is one polynom for 0 <= x <= Pi/4
425 and another one for Pi/4<x<=Pi/2
426
427 Both branches will be computed.
428 */
429 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
430 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
431
432 v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
433 v4sf poly_mask = _mm_castsi128_ps(emm2);
434 sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
435
436#else
437 /* store the integer part of y in mm0:mm1 */
438 xmm2 = _mm_movehl_ps(xmm2, y);
439 mm2 = _mm_cvttps_pi32(y);
440 mm3 = _mm_cvttps_pi32(xmm2);
441 /* j=(j+1) & (~1) (see the cephes sources) */
442 mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
443 mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
444 mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
445 mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
446 y = _mm_cvtpi32x2_ps(mm2, mm3);
447 /* get the swap sign flag */
448 mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
449 mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
450 mm0 = _mm_slli_pi32(mm0, 29);
451 mm1 = _mm_slli_pi32(mm1, 29);
452 /* get the polynom selection mask */
453 mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
454 mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
455 mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
456 mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
457 v4sf swap_sign_bit, poly_mask;
458 COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
459 COPY_MM_TO_XMM(mm2, mm3, poly_mask);
460 sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
461 _mm_empty(); /* good-bye mmx */
462#endif
463
464 /* The magic pass: "Extended precision modular arithmetic"
465 x = ((x - y * DP1) - y * DP2) - y * DP3; */
466 xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
467 xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
468 xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
469 xmm1 = _mm_mul_ps(y, xmm1);
470 xmm2 = _mm_mul_ps(y, xmm2);
471 xmm3 = _mm_mul_ps(y, xmm3);
472 x = _mm_add_ps(x, xmm1);
473 x = _mm_add_ps(x, xmm2);
474 x = _mm_add_ps(x, xmm3);
475
476 /* Evaluate the first polynom (0 <= x <= Pi/4) */
477 y = *(v4sf*)_ps_coscof_p0;
478 v4sf z = _mm_mul_ps(x,x);
479
480 y = _mm_mul_ps(y, z);
481 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
482 y = _mm_mul_ps(y, z);
483 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
484 y = _mm_mul_ps(y, z);
485 y = _mm_mul_ps(y, z);
486 v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
487 y = _mm_sub_ps(y, tmp);
488 y = _mm_add_ps(y, *(v4sf*)_ps_1);
489
490 /* Evaluate the second polynom (Pi/4 <= x <= 0) */
491
492 v4sf y2 = *(v4sf*)_ps_sincof_p0;
493 y2 = _mm_mul_ps(y2, z);
494 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
495 y2 = _mm_mul_ps(y2, z);
496 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
497 y2 = _mm_mul_ps(y2, z);
498 y2 = _mm_mul_ps(y2, x);
499 y2 = _mm_add_ps(y2, x);
500
501 /* select the correct result from the two polynoms */
502 xmm3 = poly_mask;
503 y2 = _mm_and_ps(xmm3, y2); //, xmm3);
504 y = _mm_andnot_ps(xmm3, y);
505 y = _mm_add_ps(y,y2);
506 /* update the sign */
507 y = _mm_xor_ps(y, sign_bit);
508 return y;
509}

References COPY_MM_TO_XMM.

◆ sincos_ps()

void sincos_ps ( v4sf  x,
v4sf s,
v4sf c 
)

Definition at line 631 of file SseMathFuncs.h.

631 {
632 v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
633#ifdef USE_SSE2
634 v4si emm0, emm2, emm4;
635#else
636 v2si mm0, mm1, mm2, mm3, mm4, mm5;
637#endif
638 sign_bit_sin = x;
639 /* take the absolute value */
640 x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
641 /* extract the sign bit (upper one) */
642 sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
643
644 /* scale by 4/Pi */
645 y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
646
647#ifdef USE_SSE2
648 /* store the integer part of y in emm2 */
649 emm2 = _mm_cvttps_epi32(y);
650
651 /* j=(j+1) & (~1) (see the cephes sources) */
652 emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
653 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
654 y = _mm_cvtepi32_ps(emm2);
655
656 emm4 = emm2;
657
658 /* get the swap sign flag for the sine */
659 emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
660 emm0 = _mm_slli_epi32(emm0, 29);
661 v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
662
663 /* get the polynom selection mask for the sine*/
664 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
665 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
666 v4sf poly_mask = _mm_castsi128_ps(emm2);
667#else
668 /* store the integer part of y in mm2:mm3 */
669 xmm3 = _mm_movehl_ps(xmm3, y);
670 mm2 = _mm_cvttps_pi32(y);
671 mm3 = _mm_cvttps_pi32(xmm3);
672
673 /* j=(j+1) & (~1) (see the cephes sources) */
674 mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
675 mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
676 mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
677 mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
678
679 y = _mm_cvtpi32x2_ps(mm2, mm3);
680
681 mm4 = mm2;
682 mm5 = mm3;
683
684 /* get the swap sign flag for the sine */
685 mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
686 mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
687 mm0 = _mm_slli_pi32(mm0, 29);
688 mm1 = _mm_slli_pi32(mm1, 29);
689 v4sf swap_sign_bit_sin;
690 COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
691
692 /* get the polynom selection mask for the sine */
693
694 mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
695 mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
696 mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
697 mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
698 v4sf poly_mask;
699 COPY_MM_TO_XMM(mm2, mm3, poly_mask);
700#endif
701
702 /* The magic pass: "Extended precision modular arithmetic"
703 x = ((x - y * DP1) - y * DP2) - y * DP3; */
704 xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
705 xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
706 xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
707 xmm1 = _mm_mul_ps(y, xmm1);
708 xmm2 = _mm_mul_ps(y, xmm2);
709 xmm3 = _mm_mul_ps(y, xmm3);
710 x = _mm_add_ps(x, xmm1);
711 x = _mm_add_ps(x, xmm2);
712 x = _mm_add_ps(x, xmm3);
713
714#ifdef USE_SSE2
715 emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
716 emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
717 emm4 = _mm_slli_epi32(emm4, 29);
718 v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
719#else
720 /* get the sign flag for the cosine */
721 mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
722 mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
723 mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
724 mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
725 mm4 = _mm_slli_pi32(mm4, 29);
726 mm5 = _mm_slli_pi32(mm5, 29);
727 v4sf sign_bit_cos;
728 COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
729 _mm_empty(); /* good-bye mmx */
730#endif
731
732 sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
733
734
735 /* Evaluate the first polynom (0 <= x <= Pi/4) */
736 v4sf z = _mm_mul_ps(x,x);
737 y = *(v4sf*)_ps_coscof_p0;
738
739 y = _mm_mul_ps(y, z);
740 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
741 y = _mm_mul_ps(y, z);
742 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
743 y = _mm_mul_ps(y, z);
744 y = _mm_mul_ps(y, z);
745 v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
746 y = _mm_sub_ps(y, tmp);
747 y = _mm_add_ps(y, *(v4sf*)_ps_1);
748
749 /* Evaluate the second polynom (Pi/4 <= x <= 0) */
750
751 v4sf y2 = *(v4sf*)_ps_sincof_p0;
752 y2 = _mm_mul_ps(y2, z);
753 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
754 y2 = _mm_mul_ps(y2, z);
755 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
756 y2 = _mm_mul_ps(y2, z);
757 y2 = _mm_mul_ps(y2, x);
758 y2 = _mm_add_ps(y2, x);
759
760 /* select the correct result from the two polynoms */
761 xmm3 = poly_mask;
762 v4sf ysin2 = _mm_and_ps(xmm3, y2);
763 v4sf ysin1 = _mm_andnot_ps(xmm3, y);
764 y2 = _mm_sub_ps(y2,ysin2);
765 y = _mm_sub_ps(y, ysin1);
766
767 xmm1 = _mm_add_ps(ysin1,ysin2);
768 xmm2 = _mm_add_ps(y,y2);
769
770 /* update the sign */
771 *s = _mm_xor_ps(xmm1, sign_bit_sin);
772 *c = _mm_xor_ps(xmm2, sign_bit_cos);
773}

References COPY_MM_TO_XMM.