// NOTE(review): this listing is a doxygen-style excerpt with the original line
// numbers fused into the text ("24template", "26 sizeof...") and many lines
// missing; fragments are preserved byte-for-byte below.
// Per the symbol index at the bottom of the file, this fragment belongs to a
// pre-C++20 bit_cast<To, From> emulation: enable_if'd on sizeof(To)==sizeof(From)
// and trivial copyability of both types, with an additional static_assert that
// To is trivially constructible, implemented via std::memcpy from src into a
// local To (the declaration of `dst` and the return are among the missing
// lines) -- TODO confirm against the full file.
24template <
class To,
class From>
26 sizeof(To) ==
sizeof(From) && std::is_trivially_copyable_v<From> &&
27 std::is_trivially_copyable_v<To>,
33 std::is_trivially_constructible_v<To>,
34 "This implementation additionally requires "
35 "destination type to be trivially constructible");
// memcpy is the standards-blessed way to type-pun without UB (unlike
// reinterpret_cast/union tricks); compilers optimize it to a register move.
38 std::memcpy(&dst, &src,
sizeof(To));
// Single-precision pi and pi/2 (cephes-style spellings; extra digits beyond
// float precision are intentional so the compiler rounds correctly).
42constexpr float PIF = 3.141592653589793238f;
43constexpr float PIO2F = 1.5707963267948966192f;
// One coefficient of the cephes cosf minimax polynomial (coscof_p0/p2 and the
// sincof_* / atancof_* coefficients appear in the missing lines -- see the
// symbol index at the end of this listing).
56constexpr float coscof_p1 = -1.388731625493765e-003f;
// IEEE-754 sign bit as a float bit pattern; used with and/xor intrinsics to
// extract or flip signs without leaving the SSE register file.
64static const float sign_mask = bit_cast<float>(0x80000000);
// Fragment of a 4-lane atan_ps(__m128 x), following the cephes atanf scheme
// (the signature and several interior lines, including the declarations of
// y0/y1 and acc, are missing from this excerpt -- TODO confirm).
// Step 1: strip the sign of x; it is re-applied at the end (atan is odd).
78 sign_bit = _mm_and_ps(sign_bit, _mm_set1_ps(
sign_mask));
// Step 2: range reduction.  2.414... = tan(3*pi/8), 0.414... = tan(pi/8).
// cmp0 selects the "large" range, cmp2 = cmp1 & ~cmp0 the "middle" range.
82 __m128 cmp0 = _mm_cmpgt_ps(x, _mm_set1_ps(2.414213562373095f));
84 __m128 cmp1 = _mm_cmpgt_ps(x, _mm_set1_ps(0.4142135623730950f));
87 __m128 cmp2 = _mm_andnot_ps(cmp0, cmp1);
// Large range: substitute x -> -1/x (atan(x) = pi/2 + atan(-1/x)).
91 __m128 x0 = _mm_div_ps(_mm_set1_ps(1.0f), x);
92 x0 = _mm_xor_ps(x0, _mm_set1_ps(
sign_mask));
// Middle range: substitute x -> (x-1)/(x+1) (atan(x) = pi/4 + atan(...)).
96 __m128 x1_o = _mm_sub_ps(x, _mm_set1_ps(1.0f));
97 __m128 x1_u = _mm_add_ps(x, _mm_set1_ps(1.0f));
98 __m128 x1 = _mm_div_ps(x1_o, x1_u);
// Branchless select of the reduced argument per lane via and/andnot/or masks.
100 __m128 x2 = _mm_and_ps(cmp2, x1);
101 x0 = _mm_and_ps(cmp0, x0);
102 x2 = _mm_or_ps(x2, x0);
103 cmp1 = _mm_or_ps(cmp0, cmp2);
104 x2 = _mm_and_ps(cmp1, x2);
105 x = _mm_andnot_ps(cmp1, x);
106 x = _mm_or_ps(x2, x);
// y accumulates the per-range additive offset (presumably pi/2 / pi/4 / 0,
// built in the missing lines as y0/y1) -- TODO confirm.
108 y = _mm_or_ps(y0, y1);
// Step 3: odd minimax polynomial in zz = x^2 (Horner form), then y += poly.
// Note the alternating add/sub: the atancof_* coefficients are stored with
// positive sign and the sign is applied here.
110 __m128 zz = _mm_mul_ps(x, x);
112 acc = _mm_mul_ps(acc, zz);
113 acc = _mm_sub_ps(acc, _mm_set1_ps(
atancof_p1));
114 acc = _mm_mul_ps(acc, zz);
115 acc = _mm_add_ps(acc, _mm_set1_ps(
atancof_p2));
116 acc = _mm_mul_ps(acc, zz);
117 acc = _mm_sub_ps(acc, _mm_set1_ps(
atancof_p3));
118 acc = _mm_mul_ps(acc, zz);
119 acc = _mm_mul_ps(acc, x);
120 acc = _mm_add_ps(acc, x);
121 y = _mm_add_ps(y, acc);
// Step 4: restore the original sign of the input.
124 y = _mm_xor_ps(y, sign_bit);
// Fragment of atan2_ps(__m128 y, __m128 x) (signature per the symbol index):
// handles the x==0 / y==0 special cases branchlessly with compare masks, then
// computes atan(y/x) and adds a +/-pi quadrant offset for x<0.
133 __m128 zero = _mm_setzero_ps();
134 __m128 x_eq_0 = _mm_cmpeq_ps(x, zero);
135 __m128 x_gt_0 = _mm_cmpgt_ps(x, zero);
136 __m128 x_le_0 = _mm_cmple_ps(x, zero);
137 __m128 y_eq_0 = _mm_cmpeq_ps(y, zero);
138 __m128 x_lt_0 = _mm_cmplt_ps(x, zero);
139 __m128 y_lt_0 = _mm_cmplt_ps(y, zero);
// Result 0 when (x==0 && y==0) or (y==0 && x>0).
141 __m128 zero_mask = _mm_and_ps(x_eq_0, y_eq_0);
142 __m128 zero_mask_other_case = _mm_and_ps(y_eq_0, x_gt_0);
143 zero_mask = _mm_or_ps(zero_mask, zero_mask_other_case);
// Result +/-pi/2 when x==0 && y!=0, sign taken from y.
145 __m128 pio2_mask = _mm_andnot_ps(y_eq_0, x_eq_0);
146 __m128 pio2_mask_sign = _mm_and_ps(y_lt_0, _mm_set1_ps(
sign_mask));
148 pio2_result = _mm_xor_ps(pio2_result, pio2_mask_sign);
149 pio2_result = _mm_and_ps(pio2_mask, pio2_result);
// Result pi when y==0 && x<0 (`pi` presumably a __m128 of PIF built in a
// missing line -- TODO confirm).
151 __m128 pi_mask = _mm_and_ps(y_eq_0, x_lt_0);
153 __m128 pi_result = _mm_and_ps(pi_mask,
pi);
// Quadrant offset for x<0: +pi in quadrant II, -pi in quadrant III
// (sign flipped when y<0 as well).
155 __m128 swap_sign_mask_offset = _mm_and_ps(x_lt_0, y_lt_0);
156 swap_sign_mask_offset =
157 _mm_and_ps(swap_sign_mask_offset, _mm_set1_ps(
sign_mask));
159 __m128 offset0 = _mm_setzero_ps();
161 offset1 = _mm_xor_ps(offset1, swap_sign_mask_offset);
// NOTE(review): line 163's andnot result is overwritten (not OR'd) on line
// 164; this is only harmless because offset0 is all-zero, so both forms yield
// offset1 where x<0 and 0 elsewhere.  Looks like a dropped _mm_or_ps -- worth
// confirming against the canonical source.
163 __m128 offset = _mm_andnot_ps(x_lt_0, offset0);
164 offset = _mm_and_ps(x_lt_0, offset1);
// General case: atan(y/x) + offset.  x may be 0 here; the resulting inf/nan
// lanes are masked out below by pio2_mask/zero_mask.
166 __m128 arg = _mm_div_ps(y, x);
167 __m128 atan_result =
atan_ps(arg);
168 atan_result = _mm_add_ps(atan_result, offset);
// Merge the special-case lanes over the general result.
172 __m128 result = _mm_andnot_ps(zero_mask, pio2_result);
173 atan_result = _mm_andnot_ps(zero_mask, atan_result);
174 atan_result = _mm_andnot_ps(pio2_mask, atan_result);
175 result = _mm_or_ps(result, atan_result);
176 result = _mm_or_ps(result, pi_result);
// Fragment of sincos_ps(__m128 x) -> pair{sin, cos} (signature per the symbol
// index).  This follows the well-known sse_mathfun/cephes scheme: reduce x to
// [-pi/4, pi/4] octants, evaluate sin and cos polynomials once, and select /
// sign-flip per lane based on the octant index.  Several interior lines
// (|x| computation, the 4/pi scale, the DP1..DP3 Cody-Waite steps, emm4 init,
// polynomial heads) are missing from this excerpt.
184 __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
185 __m128i emm0, emm2, emm4;
// Remember sign of x for the sine (sin is odd, cos is even).
191 sign_bit_sin = _mm_and_ps(sign_bit_sin, _mm_set1_ps(
sign_mask));
// j = truncate(x * 4/pi), then round j up to the nearest even integer
// ((j+1) & ~1) so the octant index has the parity the selection logic expects.
197 emm2 = _mm_cvttps_epi32(y);
200 emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1));
201 emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1));
202 y = _mm_cvtepi32_ps(emm2);
// Bit 2 of j, moved to the float sign-bit position (<<29), flips the sine.
207 emm0 = _mm_and_si128(emm2, _mm_set1_epi32(4));
208 emm0 = _mm_slli_epi32(emm0, 29);
209 __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
// Bit 1 of j selects which polynomial (sin vs cos) feeds which output lane.
212 emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2));
213 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
214 __m128 poly_mask = _mm_castsi128_ps(emm2);
// Extended-precision argument reduction: x = x - y*DP1 - y*DP2 - y*DP3
// (xmm1..3 hold the negated minus_cephes_DP* constants, set in missing lines).
221 xmm1 = _mm_mul_ps(y, xmm1);
222 xmm2 = _mm_mul_ps(y, xmm2);
223 xmm3 = _mm_mul_ps(y, xmm3);
224 x = _mm_add_ps(x, xmm1);
225 x = _mm_add_ps(x, xmm2);
226 x = _mm_add_ps(x, xmm3);
// Cosine sign from bits of (j - 2): andnot + <<29 puts it in the sign bit.
228 emm4 = _mm_sub_epi32(emm4, _mm_set1_epi32(2));
229 emm4 = _mm_andnot_si128(emm4, _mm_set1_epi32(4));
230 emm4 = _mm_slli_epi32(emm4, 29);
231 __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
233 sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
// Evaluate both polynomials in z = x^2.
236 __m128 z = _mm_mul_ps(x, x);
// Cosine branch: Horner with coscof_p1/p2, then cos ~ 1 - z/2 + z^2*poly.
239 y = _mm_mul_ps(y, z);
240 y = _mm_add_ps(y, _mm_set1_ps(
coscof_p1));
241 y = _mm_mul_ps(y, z);
242 y = _mm_add_ps(y, _mm_set1_ps(
coscof_p2));
243 y = _mm_mul_ps(y, z);
244 y = _mm_mul_ps(y, z);
245 __m128 tmp = _mm_mul_ps(z, _mm_set1_ps(0.5f));
246 y = _mm_sub_ps(y, tmp);
247 y = _mm_add_ps(y, _mm_set1_ps(1));
// Sine branch: sin ~ x + x*z*poly(z) with sincof_p1/p2.
252 y2 = _mm_mul_ps(y2, z);
253 y2 = _mm_add_ps(y2, _mm_set1_ps(
sincof_p1));
254 y2 = _mm_mul_ps(y2, z);
255 y2 = _mm_add_ps(y2, _mm_set1_ps(
sincof_p2));
256 y2 = _mm_mul_ps(y2, z);
257 y2 = _mm_mul_ps(y2, x);
258 y2 = _mm_add_ps(y2, x);
// Per-lane select between the two polynomials using poly_mask (in xmm3,
// presumably assigned in a missing line -- TODO confirm), then combine.
262 __m128 ysin2 = _mm_and_ps(xmm3, y2);
263 __m128 ysin1 = _mm_andnot_ps(xmm3, y);
264 y2 = _mm_sub_ps(y2, ysin2);
265 y = _mm_sub_ps(y, ysin1);
267 xmm1 = _mm_add_ps(ysin1, ysin2);
268 xmm2 = _mm_add_ps(y, y2);
// Apply the accumulated signs and return {sin, cos}.
271 return std::make_pair(
272 _mm_xor_ps(xmm1, sign_bit_sin), _mm_xor_ps(xmm2, sign_bit_cos));
// Scalar convenience wrappers (fragments): run the SIMD routine in lane 0 and
// extract the scalar result.  atan2_ss(y, x) forwards to atan2_ps; sincos_ss
// unpacks the pair from sincos_ps (its call is in a missing line).
277 return _mm_cvtss_f32(
atan2_ps(_mm_set_ss(y), _mm_set_ss(x)));
283 return std::make_pair(_mm_cvtss_f32(res.first), _mm_cvtss_f32(res.second));
// NOTE(review): despite the name, norm() returns the SQUARED magnitude
// x*x + y*y per lane -- no square root is taken.  Callers wanting |z| must
// apply sqrt themselves (see the scalar sqrt helper below).
286inline __m128
norm(__m128 x, __m128 y)
288 return _mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
// Scalar sqrt via the SSE single-lane instruction (avoids libm dispatch);
// the enclosing function signature is in a missing line -- TODO confirm.
293 __m128 sse_value = _mm_set_ss(x);
294 sse_value = _mm_sqrt_ss(sse_value);
295 return _mm_cvtss_f32(sse_value);
// Fragment of perform_parallel_simd_aligned(input, output, n, f): applies a
// SIMD functor f(real_x4, imag_x4, out_x4) to an array of interleaved
// std::complex<float>, 4 complex values per iteration, writing one float per
// input.  "aligned" in the name + _mm_load_ps/_mm_store_ps imply 16-byte
// aligned buffers -- unaligned pointers would fault; TODO confirm the
// documented contract in the full file.
298template <
typename fnc>
300 const std::complex<float>* input,
float* output,
int n,
const fnc& f)
302 for (
int i = 0; i <= n - 4; i += 4)
// Two loads cover 4 complex values (re/im interleaved).  Casting
// complex<float>* to float* relies on the standard's array-compatibility
// guarantee for std::complex.
305 auto p1 = _mm_load_ps(
reinterpret_cast<const float*
>(input + i));
306 auto p2 = _mm_load_ps(
reinterpret_cast<const float*
>(input + i + 2));
// De-interleave: rp = 4 reals, ip = 4 imaginaries.
311 auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0));
312 auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1));
317 _mm_store_ps(output + i, out);
// Scalar tail: n & ~3 is the first index not handled by the vector loop.
320 for (
int i = n & (~3); i < n; ++i)
323 f(_mm_set_ss(real(input[i])), _mm_set_ss(imag(input[i])), out);
324 output[i] = _mm_cvtss_f32(out);
// Fragment of rotate_parallel_simd_aligned(oldPhase, newPhase, output, n):
// rotates each complex sample by theta = newPhase[i] - oldPhase[i]
// (or just newPhase[i] when oldPhase is null), i.e. output[i] *= e^{i*theta}.
// 4 samples per iteration; the sincos_ps call producing `sin`/`cos` is in a
// missing line.  Same 16-byte alignment requirement as the helper above.
329 const float* oldPhase,
const float* newPhase, std::complex<float>* output,
332 for (
int i = 0; i <= n - 4; i += 4)
336 _mm_sub_ps(_mm_load_ps(newPhase + i), _mm_load_ps(oldPhase + i)) :
337 _mm_load_ps(newPhase + i));
// Load 4 complex values and de-interleave into real/imag vectors.
340 auto p1 = _mm_load_ps(
reinterpret_cast<float*
>(output + i));
341 auto p2 = _mm_load_ps(
reinterpret_cast<float*
>(output + i + 2));
346 auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0));
347 auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1));
// Complex multiply by (cos + i*sin):
//   re' = re*cos - im*sin,  im' = re*sin + im*cos.
352 auto out_rp = _mm_sub_ps(_mm_mul_ps(rp, cos), _mm_mul_ps(ip, sin));
353 auto out_ip = _mm_add_ps(_mm_mul_ps(rp, sin), _mm_mul_ps(ip, cos));
// Re-interleave and store back in place.
355 p1 = _mm_unpacklo_ps(out_rp, out_ip);
356 p2 = _mm_unpackhi_ps(out_rp, out_ip);
358 _mm_store_ps(
reinterpret_cast<float*
>(output + i), p1);
359 _mm_store_ps(
reinterpret_cast<float*
>(output + i + 2), p2);
// Scalar tail for the remaining n % 4 samples, using libm sin/cos.
362 for (
int i = n & (~3); i < n; ++i)
364 const auto theta = oldPhase ? newPhase[i] - oldPhase[i] : newPhase[i];
365 output[i] *= std::complex<float>(cosf(theta), sinf(theta));
constexpr float coscof_p0
constexpr float atancof_p2
constexpr float atancof_p3
constexpr float minus_cephes_DP2
constexpr float sincof_p1
constexpr float cephes_FOPI
constexpr float cephes_PIO2F
constexpr float cephes_PIF
constexpr float sincof_p0
constexpr float atancof_p0
constexpr float atancof_p1
constexpr float coscof_p2
constexpr float minus_cephes_DP1
constexpr float sincof_p2
std::enable_if_t< sizeof(To)==sizeof(From) &&std::is_trivially_copyable_v< From > &&std::is_trivially_copyable_v< To >, To > bit_cast(const From &src) noexcept
constexpr float cephes_PIO4F
static const float inv_sign_mask
constexpr float minus_cephes_DP3
static const float sign_mask
constexpr float coscof_p1
__m128 atan2_ps(__m128 y, __m128 x)
std::pair< __m128, __m128 > sincos_ps(__m128 x)
float atan2_ss(float y, float x)
__m128 norm(__m128 x, __m128 y)
void perform_parallel_simd_aligned(const std::complex< float > *input, float *output, int n, const fnc &f)
std::pair< float, float > sincos_ss(float angle)
void rotate_parallel_simd_aligned(const float *oldPhase, const float *newPhase, std::complex< float > *output, int n)