/* alignment macros: MSVC wants the attribute before the declaration,
   gcc/icc after it */
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

#include <emmintrin.h> /* SSE2 intrinsics */

typedef __m128 v4sf;  /* vector of 4 floats (SSE1) */
typedef __m128i v4si; /* vector of 4 ints (SSE2)   */
typedef __m64 v2si;   /* vector of 2 ints (MMX)    */
/* declare some SSE constants */
#define _PS_CONST(Name, Val) \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
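
/* The functions below read packed constants declared with these macros.
   The full header defines the complete Cephes coefficient tables
   (_ps_cephes_log_p0.._p8, _ps_cephes_exp_p0.._p5, _ps_sincof_*,
   _ps_coscof_*, etc.), which are omitted here; a representative subset,
   matching the names used in the code below, looks like this: */
_PS_CONST(1  , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);
_PS_CONST(cephes_FOPI, 1.27323954473516); /* 4 / Pi */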
/* __m64 <-> __m128 shuffling through a union */
typedef union xmm_mm_union { __m128 xmm; __m64 mm[2]; } xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
  xmm_mm_union u; u.xmm = xmm_; mm0_ = u.mm[0]; mm1_ = u.mm[1]; }

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
  xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; }
/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0 */
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part 2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); /* negative arg will be NaN */
  return x;
}
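
/* Minimal usage sketch for log_ps (not part of the original header;
   assumes the file is compiled with USE_SSE2 and that the full constant
   tables are present). Each lane should agree with logf() to within a
   few ULPs: */
#include <math.h>
#include <stdio.h>
static void check_log_ps(void) {
  ALIGN16_BEG float in[4] ALIGN16_END = { 0.5f, 1.0f, 2.7182817f, 1000.0f };
  ALIGN16_BEG float out[4] ALIGN16_END;
  _mm_store_ps(out, log_ps(_mm_load_ps(in)));
  for (int i = 0; i < 4; ++i)
    printf("log_ps(%g) = %g   logf = %g\n", in[i], out[i], logf(in[i]));
}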
v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp  = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);

  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);
  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty();
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
  y = _mm_mul_ps(y, pow2n);
  return y;
}
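
/* The identity behind exp_ps, in scalar form (an illustrative sketch, not
   part of the original header): exp(x) = 2^n * exp(r), where
   n = round(x / ln 2) and r = x - n*ln 2. ln 2 is split into C1 + C2
   (C1 = 0.693359375, C2 = -2.12194440e-4) so that n*ln 2 can be
   subtracted without losing low-order bits. A short Taylor series stands
   in for the Cephes minimax polynomial here: */
#include <math.h>
static inline float exp_scalar_sketch(float xf) {
  float n = floorf(xf * 1.44269504f + 0.5f); /* n = round(x * log2(e)) */
  float r = xf - n * 0.693359375f;           /* subtract n*C1 */
  r = r + n * 2.12194440e-4f;                /* subtract n*C2 (C2 < 0) */
  /* 1 + r + r^2/2 + r^3/6 + r^4/24 + r^5/120 */
  float p = 1.0f + r * (1.0f + r * (0.5f + r * (1.0f/6.0f + r * (1.0f/24.0f + r / 120.0f))));
  return ldexpf(p, (int)n);                  /* scale by 2^n */
}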
/* evaluation of 4 sines at once, using SSE1+MMX intrinsics (or SSE2 when
   USE_SSE2 is defined) */
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask:
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2.
     Both branches will be computed. */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynom selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= Pi/2) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
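
/* Scalar sketch of the octant logic used above (illustration only, not
   part of the original header). j indexes octants of width Pi/4; bit 1
   of j selects between the sine and cosine polynomials, bit 2 flips the
   sign, and the original sign of x is applied last: */
#include <math.h>
static inline float sin_octant_sketch(float xf) {
  float ax = fabsf(xf);
  int j = (int)(ax * 1.27323954473516f);   /* ax * 4/Pi */
  j = (j + 1) & ~1;                        /* j = (j+1) & (~1) */
  float y = (float)j;
  /* extended-precision reduction, Pi/4 split as DP1+DP2+DP3 */
  float r = ((ax - y * 0.78515625f)
             - y * 2.4187564849853515625e-4f)
             - y * 3.77489497744594108e-8f;
  float v = (j & 2) ? cosf(r) : sinf(r);   /* polynom selection */
  if (j & 4) v = -v;                       /* swap sign flag */
  return (xf < 0.0f) ? -v : v;
}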
/* almost the same as sin_ps: the octant index is offset by 2 below,
   since cos(x) = sin(x + Pi/2) */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynom selection mask in mm2:mm3 */
  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= Pi/2) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
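
/* Quick accuracy sweep for cos_ps (a sketch, not part of the original
   header; assumes USE_SSE2). cos_ps reuses the sine octant machinery
   with j offset by 2, and skips the input sign handling because cosine
   is even: */
#include <math.h>
#include <stdio.h>
static void sweep_cos_ps(void) {
  float max_err = 0.0f;
  for (float t = -8.0f; t < 8.0f; t += 0.004f) {
    ALIGN16_BEG float in[4] ALIGN16_END = { t, t + 0.001f, t + 0.002f, t + 0.003f };
    ALIGN16_BEG float out[4] ALIGN16_END;
    _mm_store_ps(out, cos_ps(_mm_load_ps(in)));
    for (int i = 0; i < 4; ++i) {
      float e = fabsf(out[i] - cosf(in[i]));
      if (e > max_err) max_err = e;
    }
  }
  printf("cos_ps max abs error vs cosf: %g\n", max_err);
}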
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace
   both of them: it is almost as fast, and gives you a free cosine with
   your sine */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynom selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynom selection mask for the sine */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= Pi/2) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the signs */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
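
/* End-to-end usage sketch (not part of the original header): four sines
   and cosines in one call. Assumes USE_SSE2 and the full constant
   tables: */
#include <stdio.h>
int main(void) {
  ALIGN16_BEG float angles[4] ALIGN16_END = { 0.0f, 0.5f, 1.0f, 3.1415927f };
  ALIGN16_BEG float sines[4] ALIGN16_END;
  ALIGN16_BEG float cosines[4] ALIGN16_END;
  v4sf s, c;
  sincos_ps(_mm_load_ps(angles), &s, &c);
  _mm_store_ps(sines, s);
  _mm_store_ps(cosines, c);
  for (int i = 0; i < 4; ++i)
    printf("sin(%g) = %g   cos(%g) = %g\n",
           angles[i], sines[i], angles[i], cosines[i]);
  return 0;
}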