13#ifndef GDALSSE_PRIV_H_INCLUDED
14#define GDALSSE_PRIV_H_INCLUDED
22#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
23 !defined(USE_SSE2_EMULATION)
27#ifdef USE_NEON_OPTIMIZATIONS
28#include "include_sse2neon.h"
33#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
38#include "gdal_priv_templates.hpp"
40static inline __m128i GDALCopyInt16ToXMM(
const void *ptr)
44 return _mm_cvtsi32_si128(s);
47static inline __m128i GDALCopyInt32ToXMM(
const void *ptr)
51 return _mm_cvtsi32_si128(i);
54static inline __m128i GDALCopyInt64ToXMM(
const void *ptr)
56#if defined(__i386__) || defined(_M_IX86)
57 return _mm_loadl_epi64(
static_cast<const __m128i *
>(ptr));
61 return _mm_cvtsi64_si128(i);
65#ifndef GDALCopyXMMToInt16_defined
66#define GDALCopyXMMToInt16_defined
68static inline void GDALCopyXMMToInt16(
const __m128i xmm,
void *pDest)
70 GInt16 i =
static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
86 : xmm(_mm_undefined_ps())
91 XMMReg4Float(
const XMMReg4Float &other) : xmm(other.xmm)
95 static inline XMMReg4Float Zero()
102 static inline XMMReg4Float Set1(
float f)
105 reg.xmm = _mm_set1_ps(f);
109 static inline XMMReg4Float LoadAllVal(
const float *ptr)
111 return Load4Val(ptr);
114 static inline XMMReg4Float Load4Val(
const float *ptr)
121 static inline XMMReg4Float Load4Val(
const unsigned char *ptr)
128 static inline XMMReg4Float Load4Val(
const short *ptr)
135 static inline XMMReg4Float Load4Val(
const unsigned short *ptr)
142 static inline XMMReg4Float Load4Val(
const int *ptr)
149 static inline XMMReg4Float Equals(
const XMMReg4Float &expr1,
150 const XMMReg4Float &expr2)
153 reg.xmm = _mm_cmpeq_ps(expr1.xmm, expr2.xmm);
157 static inline XMMReg4Float NotEquals(
const XMMReg4Float &expr1,
158 const XMMReg4Float &expr2)
161 reg.xmm = _mm_cmpneq_ps(expr1.xmm, expr2.xmm);
165 static inline XMMReg4Float Lesser(
const XMMReg4Float &expr1,
166 const XMMReg4Float &expr2)
169 reg.xmm = _mm_cmplt_ps(expr1.xmm, expr2.xmm);
173 static inline XMMReg4Float Greater(
const XMMReg4Float &expr1,
174 const XMMReg4Float &expr2)
177 reg.xmm = _mm_cmpgt_ps(expr1.xmm, expr2.xmm);
181 static inline XMMReg4Float And(
const XMMReg4Float &expr1,
182 const XMMReg4Float &expr2)
185 reg.xmm = _mm_and_ps(expr1.xmm, expr2.xmm);
189 static inline XMMReg4Float Ternary(
const XMMReg4Float &cond,
190 const XMMReg4Float &true_expr,
191 const XMMReg4Float &false_expr)
194 reg.xmm = _mm_or_ps(_mm_and_ps(cond.xmm, true_expr.xmm),
195 _mm_andnot_ps(cond.xmm, false_expr.xmm));
199 static inline XMMReg4Float Min(
const XMMReg4Float &expr1,
200 const XMMReg4Float &expr2)
203 reg.xmm = _mm_min_ps(expr1.xmm, expr2.xmm);
207 static inline XMMReg4Float Max(
const XMMReg4Float &expr1,
208 const XMMReg4Float &expr2)
211 reg.xmm = _mm_max_ps(expr1.xmm, expr2.xmm);
215 inline void nsLoad4Val(
const float *ptr)
217 xmm = _mm_loadu_ps(ptr);
220 static inline void Load16Val(
const float *ptr, XMMReg4Float &r0,
221 XMMReg4Float &r1, XMMReg4Float &r2,
225 r1.nsLoad4Val(ptr + 4);
226 r2.nsLoad4Val(ptr + 8);
227 r3.nsLoad4Val(ptr + 12);
230 inline void nsLoad4Val(
const int *ptr)
232 const __m128i xmm_i =
233 _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(ptr));
234 xmm = _mm_cvtepi32_ps(xmm_i);
237 static inline void Load16Val(
const int *ptr, XMMReg4Float &r0,
238 XMMReg4Float &r1, XMMReg4Float &r2,
242 r1.nsLoad4Val(ptr + 4);
243 r2.nsLoad4Val(ptr + 8);
244 r3.nsLoad4Val(ptr + 12);
// Zero-extends the four lowest unsigned 8-bit lanes of x to four 32-bit
// lanes. Uses the dedicated SSE4.1 instruction when available, otherwise
// widens in two SSE2 steps (8 -> 16 -> 32 bits) by interleaving with zero.
static inline __m128i cvtepu8_epi32(__m128i x)
{
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    return _mm_cvtepu8_epi32(x);
#else
    const __m128i zero = _mm_setzero_si128();
    const __m128i as_u16 = _mm_unpacklo_epi8(x, zero);
    return _mm_unpacklo_epi16(as_u16, zero);
#endif
}
257 inline void nsLoad4Val(
const unsigned char *ptr)
259 const __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
260 xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
263 static inline void Load8Val(
const unsigned char *ptr, XMMReg4Float &r0,
266 const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
267 r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
268 r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
271 static inline void Load16Val(
const unsigned char *ptr, XMMReg4Float &r0,
272 XMMReg4Float &r1, XMMReg4Float &r2,
275 const __m128i xmm_i =
276 _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(ptr));
277 r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
278 r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
279 r2.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 8)));
280 r3.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 12)));
// Sign-extends the four lowest signed 16-bit lanes of x to four 32-bit
// lanes. Without SSE4.1, each 16-bit lane is duplicated into both halves
// of a 32-bit lane and an arithmetic right shift by 16 then propagates
// the sign bit.
static inline __m128i cvtepi16_epi32(__m128i x)
{
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    return _mm_cvtepi16_epi32(x);
#else
    const __m128i duplicated = _mm_unpacklo_epi16(x, x);
    return _mm_srai_epi32(duplicated, 16);
#endif
}
295 inline void nsLoad4Val(
const short *ptr)
297 const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
298 xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
301 static inline void Load8Val(
const short *ptr, XMMReg4Float &r0,
304 const __m128i xmm_i =
305 _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(ptr));
306 r0.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
307 r1.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(_mm_srli_si128(xmm_i, 8)));
310 static inline void Load16Val(
const short *ptr, XMMReg4Float &r0,
311 XMMReg4Float &r1, XMMReg4Float &r2,
314 Load8Val(ptr, r0, r1);
315 Load8Val(ptr + 8, r2, r3);
// Zero-extends the four lowest unsigned 16-bit lanes of x to four 32-bit
// lanes. The SSE2 fallback interleaves the low lanes with zero.
static inline __m128i cvtepu16_epi32(__m128i x)
{
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
    return _mm_cvtepu16_epi32(x);
#else
    const __m128i zero = _mm_setzero_si128();
    return _mm_unpacklo_epi16(x, zero);
#endif
}
328 inline void nsLoad4Val(
const unsigned short *ptr)
330 const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
331 xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
334 static inline void Load8Val(
const unsigned short *ptr, XMMReg4Float &r0,
337 const __m128i xmm_i =
338 _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(ptr));
339 r0.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
340 r1.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(_mm_srli_si128(xmm_i, 8)));
343 static inline void Load16Val(
const unsigned short *ptr, XMMReg4Float &r0,
344 XMMReg4Float &r1, XMMReg4Float &r2,
347 Load8Val(ptr, r0, r1);
348 Load8Val(ptr + 8, r2, r3);
351 inline void Zeroize()
353 xmm = _mm_setzero_ps();
356 inline XMMReg4Float &operator=(
const XMMReg4Float &other)
362 inline XMMReg4Float &operator+=(
const XMMReg4Float &other)
364 xmm = _mm_add_ps(xmm, other.xmm);
368 inline XMMReg4Float &operator-=(
const XMMReg4Float &other)
370 xmm = _mm_sub_ps(xmm, other.xmm);
374 inline XMMReg4Float &operator*=(
const XMMReg4Float &other)
376 xmm = _mm_mul_ps(xmm, other.xmm);
380 inline XMMReg4Float operator+(
const XMMReg4Float &other)
const
383 ret.xmm = _mm_add_ps(xmm, other.xmm);
387 inline XMMReg4Float operator-(
const XMMReg4Float &other)
const
390 ret.xmm = _mm_sub_ps(xmm, other.xmm);
394 inline XMMReg4Float operator*(
const XMMReg4Float &other)
const
397 ret.xmm = _mm_mul_ps(xmm, other.xmm);
401 inline XMMReg4Float operator/(
const XMMReg4Float &other)
const
404 ret.xmm = _mm_div_ps(xmm, other.xmm);
408 inline XMMReg4Float inverse()
const
411 ret.xmm = _mm_div_ps(_mm_set1_ps(1.0f), xmm);
415 inline XMMReg4Int truncate_to_int()
const;
417 inline XMMReg4Float cast_to_float()
const
422 inline XMMReg4Double cast_to_double()
const;
424 inline XMMReg4Float approx_inv_sqrt(
const XMMReg4Float &one,
425 const XMMReg4Float &half)
const
428 __m128 reg_half = _mm_mul_ps(reg, half.xmm);
430 reg = _mm_rsqrt_ps(reg);
434 const __m128 one_and_a_half = _mm_add_ps(one.xmm, half.xmm);
436 reg, _mm_sub_ps(one_and_a_half,
437 _mm_mul_ps(reg_half, _mm_mul_ps(reg, reg))));
443 inline void StoreAllVal(
float *ptr)
const
448 inline void Store4Val(
float *ptr)
const
450 _mm_storeu_ps(ptr, xmm);
453 inline void Store4ValAligned(
float *ptr)
const
455 _mm_store_ps(ptr, xmm);
458 inline operator float()
const
460 return _mm_cvtss_f32(xmm);
470#if !defined(_MSC_VER)
471 : xmm(_mm_undefined_si128())
476 XMMReg4Int(
const XMMReg4Int &other) : xmm(other.xmm)
480 inline XMMReg4Int &operator=(
const XMMReg4Int &other)
486 static inline XMMReg4Int Zero()
489 reg.xmm = _mm_setzero_si128();
493 static inline XMMReg4Int Set1(
int i)
496 reg.xmm = _mm_set1_epi32(i);
500 static inline XMMReg4Int LoadAllVal(
const int *ptr)
502 return Load4Val(ptr);
505 static inline XMMReg4Int Load4Val(
const int *ptr)
512 inline void nsLoad4Val(
const int *ptr)
514 xmm = _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(ptr));
517 static inline XMMReg4Int Equals(
const XMMReg4Int &expr1,
518 const XMMReg4Int &expr2)
521 reg.xmm = _mm_cmpeq_epi32(expr1.xmm, expr2.xmm);
525 static inline XMMReg4Int Ternary(
const XMMReg4Int &cond,
526 const XMMReg4Int &true_expr,
527 const XMMReg4Int &false_expr)
530 reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
531 _mm_andnot_si128(cond.xmm, false_expr.xmm));
535 inline XMMReg4Int &operator+=(
const XMMReg4Int &other)
537 xmm = _mm_add_epi32(xmm, other.xmm);
541 inline XMMReg4Int &operator-=(
const XMMReg4Int &other)
543 xmm = _mm_sub_epi32(xmm, other.xmm);
547 inline XMMReg4Int operator+(
const XMMReg4Int &other)
const
550 ret.xmm = _mm_add_epi32(xmm, other.xmm);
554 inline XMMReg4Int operator-(
const XMMReg4Int &other)
const
557 ret.xmm = _mm_sub_epi32(xmm, other.xmm);
561 XMMReg4Double cast_to_double()
const;
563 XMMReg4Float cast_to_float()
const
566 ret.xmm = _mm_cvtepi32_ps(xmm);
571inline XMMReg4Int XMMReg4Float::truncate_to_int()
const
574 ret.xmm = _mm_cvttps_epi32(xmm);
584#if !defined(_MSC_VER)
585 : xmm(_mm_undefined_si128())
590 XMMReg8Byte(
const XMMReg8Byte &other) : xmm(other.xmm)
594 static inline XMMReg8Byte Zero()
597 reg.xmm = _mm_setzero_si128();
601 static inline XMMReg8Byte Set1(
char i)
604 reg.xmm = _mm_set1_epi8(i);
608 static inline XMMReg8Byte Equals(
const XMMReg8Byte &expr1,
609 const XMMReg8Byte &expr2)
612 reg.xmm = _mm_cmpeq_epi8(expr1.xmm, expr2.xmm);
616 static inline XMMReg8Byte Or(
const XMMReg8Byte &expr1,
617 const XMMReg8Byte &expr2)
620 reg.xmm = _mm_or_si128(expr1.xmm, expr2.xmm);
624 static inline XMMReg8Byte Ternary(
const XMMReg8Byte &cond,
625 const XMMReg8Byte &true_expr,
626 const XMMReg8Byte &false_expr)
629 reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
630 _mm_andnot_si128(cond.xmm, false_expr.xmm));
634 inline XMMReg8Byte operator+(
const XMMReg8Byte &other)
const
637 ret.xmm = _mm_add_epi8(xmm, other.xmm);
641 inline XMMReg8Byte operator-(
const XMMReg8Byte &other)
const
644 ret.xmm = _mm_sub_epi8(xmm, other.xmm);
648 static inline XMMReg8Byte Pack(
const XMMReg4Int &r0,
const XMMReg4Int &r1)
651 reg.xmm = _mm_packs_epi32(r0.xmm, r1.xmm);
652 reg.xmm = _mm_packus_epi16(reg.xmm, reg.xmm);
656 inline void Store8Val(
unsigned char *ptr)
const
658 GDALCopyXMMToInt64(xmm,
reinterpret_cast<GInt64 *
>(ptr));
668#if !defined(_MSC_VER)
669 : xmm(_mm_undefined_pd())
674 XMMReg2Double(
double val) : xmm(_mm_load_sd(&val))
678 XMMReg2Double(
const XMMReg2Double &other) : xmm(other.xmm)
682 static inline XMMReg2Double Set1(
double d)
685 reg.xmm = _mm_set1_pd(d);
689 static inline XMMReg2Double Zero()
696 static inline XMMReg2Double Load1ValHighAndLow(
const double *ptr)
699 reg.nsLoad1ValHighAndLow(ptr);
703 static inline XMMReg2Double Load2Val(
const double *ptr)
710 static inline XMMReg2Double Load2Val(
const float *ptr)
717 static inline XMMReg2Double Load2ValAligned(
const double *ptr)
720 reg.nsLoad2ValAligned(ptr);
724 static inline XMMReg2Double Load2Val(
const unsigned char *ptr)
731 static inline XMMReg2Double Load2Val(
const short *ptr)
738 static inline XMMReg2Double Load2Val(
const unsigned short *ptr)
745 static inline XMMReg2Double Load2Val(
const int *ptr)
752 static inline XMMReg2Double Equals(
const XMMReg2Double &expr1,
753 const XMMReg2Double &expr2)
756 reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
760 static inline XMMReg2Double NotEquals(
const XMMReg2Double &expr1,
761 const XMMReg2Double &expr2)
764 reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
768 static inline XMMReg2Double Greater(
const XMMReg2Double &expr1,
769 const XMMReg2Double &expr2)
772 reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
776 static inline XMMReg2Double And(
const XMMReg2Double &expr1,
777 const XMMReg2Double &expr2)
780 reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
784 static inline XMMReg2Double Ternary(
const XMMReg2Double &cond,
785 const XMMReg2Double &true_expr,
786 const XMMReg2Double &false_expr)
789 reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
790 _mm_andnot_pd(cond.xmm, false_expr.xmm));
794 static inline XMMReg2Double Min(
const XMMReg2Double &expr1,
795 const XMMReg2Double &expr2)
798 reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
802 inline void nsLoad1ValHighAndLow(
const double *ptr)
804 xmm = _mm_load1_pd(ptr);
807 inline void nsLoad2Val(
const double *ptr)
809 xmm = _mm_loadu_pd(ptr);
812 inline void nsLoad2ValAligned(
const double *ptr)
814 xmm = _mm_load_pd(ptr);
817 inline void nsLoad2Val(
const float *ptr)
819 xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
822 inline void nsLoad2Val(
const int *ptr)
824 xmm = _mm_cvtepi32_pd(GDALCopyInt64ToXMM(ptr));
827 inline void nsLoad2Val(
const unsigned char *ptr)
829 __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
830#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
831 xmm_i = _mm_cvtepu8_epi32(xmm_i);
833 xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
834 xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
836 xmm = _mm_cvtepi32_pd(xmm_i);
839 inline void nsLoad2Val(
const short *ptr)
841 __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
842#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
843 xmm_i = _mm_cvtepi16_epi32(xmm_i);
845 xmm_i = _mm_unpacklo_epi16(
847 xmm_i = _mm_srai_epi32(
850 xmm = _mm_cvtepi32_pd(xmm_i);
853 inline void nsLoad2Val(
const unsigned short *ptr)
855 __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
856#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
857 xmm_i = _mm_cvtepu16_epi32(xmm_i);
859 xmm_i = _mm_unpacklo_epi16(
861 _mm_setzero_si128());
863 xmm = _mm_cvtepi32_pd(xmm_i);
866 static inline void Load4Val(
const unsigned char *ptr, XMMReg2Double &low,
869 __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
870#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
871 xmm_i = _mm_cvtepu8_epi32(xmm_i);
873 xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
874 xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
876 low.xmm = _mm_cvtepi32_pd(xmm_i);
878 _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
881 static inline void Load4Val(
const short *ptr, XMMReg2Double &low,
885 high.nsLoad2Val(ptr + 2);
888 static inline void Load4Val(
const unsigned short *ptr, XMMReg2Double &low,
892 high.nsLoad2Val(ptr + 2);
895 static inline void Load4Val(
const double *ptr, XMMReg2Double &low,
899 high.nsLoad2Val(ptr + 2);
902 static inline void Load4Val(
const float *ptr, XMMReg2Double &low,
905 __m128 temp1 = _mm_loadu_ps(ptr);
906 __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
907 low.xmm = _mm_cvtps_pd(temp1);
908 high.xmm = _mm_cvtps_pd(temp2);
911 inline void Zeroize()
913 xmm = _mm_setzero_pd();
916 inline XMMReg2Double &operator=(
const XMMReg2Double &other)
922 inline XMMReg2Double &operator+=(
const XMMReg2Double &other)
924 xmm = _mm_add_pd(xmm, other.xmm);
928 inline XMMReg2Double &operator*=(
const XMMReg2Double &other)
930 xmm = _mm_mul_pd(xmm, other.xmm);
934 inline XMMReg2Double operator+(
const XMMReg2Double &other)
const
937 ret.xmm = _mm_add_pd(xmm, other.xmm);
941 inline XMMReg2Double operator-(
const XMMReg2Double &other)
const
944 ret.xmm = _mm_sub_pd(xmm, other.xmm);
948 inline XMMReg2Double operator*(
const XMMReg2Double &other)
const
951 ret.xmm = _mm_mul_pd(xmm, other.xmm);
955 inline XMMReg2Double operator/(
const XMMReg2Double &other)
const
958 ret.xmm = _mm_div_pd(xmm, other.xmm);
962 inline double GetHorizSum()
const
965 xmm2 = _mm_shuffle_pd(
968 return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
971 inline void Store2Val(
double *ptr)
const
973 _mm_storeu_pd(ptr, xmm);
976 inline void Store2ValAligned(
double *ptr)
const
978 _mm_store_pd(ptr, xmm);
981 inline void Store2Val(
float *ptr)
const
983 __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
984 GDALCopyXMMToInt64(xmm_i,
reinterpret_cast<GInt64 *
>(ptr));
987 inline void Store2Val(
unsigned char *ptr)
const
989 __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
992 tmp = _mm_packs_epi32(tmp, tmp);
993 tmp = _mm_packus_epi16(tmp, tmp);
994 GDALCopyXMMToInt16(tmp,
reinterpret_cast<GInt16 *
>(ptr));
997 inline void Store2Val(
unsigned short *ptr)
const
999 __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
1003 tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
1004 GDALCopyXMMToInt32(tmp,
reinterpret_cast<GInt32 *
>(ptr));
1007 inline void StoreMask(
unsigned char *ptr)
const
1009 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(ptr),
1010 _mm_castpd_si128(xmm));
1013 inline operator double()
const
1015 return _mm_cvtsd_f64(xmm);
1021#ifndef NO_WARN_USE_SSE2_EMULATION
1022#warning "Software emulation of SSE2 !"
1032 XMMReg2Double() =
default;
1034 explicit XMMReg2Double(
double val)
1040 XMMReg2Double(
const XMMReg2Double &other) : low(other.low), high(other.high)
1044 static inline XMMReg2Double Zero()
1051 static inline XMMReg2Double Set1(
double d)
1059 static inline XMMReg2Double Load1ValHighAndLow(
const double *ptr)
1062 reg.nsLoad1ValHighAndLow(ptr);
1066 static inline XMMReg2Double Equals(
const XMMReg2Double &expr1,
1067 const XMMReg2Double &expr2)
1071 if (expr1.low == expr2.low)
1072 memset(&(reg.low), 0xFF,
sizeof(
double));
1076 if (expr1.high == expr2.high)
1077 memset(&(reg.high), 0xFF,
sizeof(
double));
1084 static inline XMMReg2Double NotEquals(
const XMMReg2Double &expr1,
1085 const XMMReg2Double &expr2)
1089 if (expr1.low != expr2.low)
1090 memset(&(reg.low), 0xFF,
sizeof(
double));
1094 if (expr1.high != expr2.high)
1095 memset(&(reg.high), 0xFF,
sizeof(
double));
1102 static inline XMMReg2Double Greater(
const XMMReg2Double &expr1,
1103 const XMMReg2Double &expr2)
1107 if (expr1.low > expr2.low)
1108 memset(&(reg.low), 0xFF,
sizeof(
double));
1112 if (expr1.high > expr2.high)
1113 memset(&(reg.high), 0xFF,
sizeof(
double));
1120 static inline XMMReg2Double And(
const XMMReg2Double &expr1,
1121 const XMMReg2Double &expr2)
1124 int low1[2], high1[2];
1125 int low2[2], high2[2];
1126 memcpy(low1, &expr1.low,
sizeof(
double));
1127 memcpy(high1, &expr1.high,
sizeof(
double));
1128 memcpy(low2, &expr2.low,
sizeof(
double));
1129 memcpy(high2, &expr2.high,
sizeof(
double));
1132 high1[0] &= high2[0];
1133 high1[1] &= high2[1];
1134 memcpy(®.low, low1,
sizeof(
double));
1135 memcpy(®.high, high1,
sizeof(
double));
1139 static inline XMMReg2Double Ternary(
const XMMReg2Double &cond,
1140 const XMMReg2Double &true_expr,
1141 const XMMReg2Double &false_expr)
1145 reg.low = true_expr.low;
1147 reg.low = false_expr.low;
1149 reg.high = true_expr.high;
1151 reg.high = false_expr.high;
1155 static inline XMMReg2Double Min(
const XMMReg2Double &expr1,
1156 const XMMReg2Double &expr2)
1159 reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
1160 reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
1164 static inline XMMReg2Double Load2Val(
const double *ptr)
1167 reg.nsLoad2Val(ptr);
1171 static inline XMMReg2Double Load2ValAligned(
const double *ptr)
1174 reg.nsLoad2ValAligned(ptr);
1178 static inline XMMReg2Double Load2Val(
const float *ptr)
1181 reg.nsLoad2Val(ptr);
1185 static inline XMMReg2Double Load2Val(
const unsigned char *ptr)
1188 reg.nsLoad2Val(ptr);
1192 static inline XMMReg2Double Load2Val(
const short *ptr)
1195 reg.nsLoad2Val(ptr);
1199 static inline XMMReg2Double Load2Val(
const unsigned short *ptr)
1202 reg.nsLoad2Val(ptr);
1206 static inline XMMReg2Double Load2Val(
const int *ptr)
1209 reg.nsLoad2Val(ptr);
1213 inline void nsLoad1ValHighAndLow(
const double *ptr)
1219 inline void nsLoad2Val(
const double *ptr)
1225 inline void nsLoad2ValAligned(
const double *ptr)
1231 inline void nsLoad2Val(
const float *ptr)
1237 inline void nsLoad2Val(
const unsigned char *ptr)
1243 inline void nsLoad2Val(
const short *ptr)
1249 inline void nsLoad2Val(
const unsigned short *ptr)
1255 inline void nsLoad2Val(
const int *ptr)
1261 static inline void Load4Val(
const unsigned char *ptr, XMMReg2Double &low,
1262 XMMReg2Double &high)
1270 static inline void Load4Val(
const short *ptr, XMMReg2Double &low,
1271 XMMReg2Double &high)
1273 low.nsLoad2Val(ptr);
1274 high.nsLoad2Val(ptr + 2);
1277 static inline void Load4Val(
const unsigned short *ptr, XMMReg2Double &low,
1278 XMMReg2Double &high)
1280 low.nsLoad2Val(ptr);
1281 high.nsLoad2Val(ptr + 2);
1284 static inline void Load4Val(
const double *ptr, XMMReg2Double &low,
1285 XMMReg2Double &high)
1287 low.nsLoad2Val(ptr);
1288 high.nsLoad2Val(ptr + 2);
1291 static inline void Load4Val(
const float *ptr, XMMReg2Double &low,
1292 XMMReg2Double &high)
1294 low.nsLoad2Val(ptr);
1295 high.nsLoad2Val(ptr + 2);
1298 inline void Zeroize()
1304 inline XMMReg2Double &operator=(
const XMMReg2Double &other)
1311 inline XMMReg2Double &operator+=(
const XMMReg2Double &other)
1318 inline XMMReg2Double &operator*=(
const XMMReg2Double &other)
1325 inline XMMReg2Double operator+(
const XMMReg2Double &other)
const
1328 ret.low = low + other.low;
1329 ret.high = high + other.high;
1333 inline XMMReg2Double operator-(
const XMMReg2Double &other)
const
1336 ret.low = low - other.low;
1337 ret.high = high - other.high;
1341 inline XMMReg2Double operator*(
const XMMReg2Double &other)
const
1344 ret.low = low * other.low;
1345 ret.high = high * other.high;
1349 inline XMMReg2Double operator/(
const XMMReg2Double &other)
const
1352 ret.low = low / other.low;
1353 ret.high = high / other.high;
1357 inline double GetHorizSum()
const
1362 inline void Store2Val(
double *ptr)
const
1368 inline void Store2ValAligned(
double *ptr)
const
1374 inline void Store2Val(
float *ptr)
const
1376 ptr[0] =
static_cast<float>(low);
1377 ptr[1] =
static_cast<float>(high);
1380 void Store2Val(
unsigned char *ptr)
const
1382 ptr[0] =
static_cast<unsigned char>(low + 0.5);
1383 ptr[1] =
static_cast<unsigned char>(high + 0.5);
1386 void Store2Val(
unsigned short *ptr)
const
1388 ptr[0] =
static_cast<GUInt16>(low + 0.5);
1389 ptr[1] =
static_cast<GUInt16>(high + 0.5);
1392 inline void StoreMask(
unsigned char *ptr)
const
1394 memcpy(ptr, &low, 8);
1395 memcpy(ptr + 8, &high, 8);
1398 inline operator double()
const
1406#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)
1408#include <immintrin.h>
1415 XMMReg4Double() : ymm(_mm256_setzero_pd())
1419 XMMReg4Double(
const XMMReg4Double &other) : ymm(other.ymm)
1423 static inline XMMReg4Double Zero()
1430 static inline XMMReg4Double Set1(
double d)
1433 reg.ymm = _mm256_set1_pd(d);
1437 inline void Zeroize()
1439 ymm = _mm256_setzero_pd();
1442 static inline XMMReg4Double Load1ValHighAndLow(
const double *ptr)
1445 reg.nsLoad1ValHighAndLow(ptr);
1449 inline void nsLoad1ValHighAndLow(
const double *ptr)
1451 ymm = _mm256_set1_pd(*ptr);
1454 static inline XMMReg4Double Load4Val(
const unsigned char *ptr)
1457 reg.nsLoad4Val(ptr);
1461 inline void nsLoad4Val(
const unsigned char *ptr)
1463 __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
1464 xmm_i = _mm_cvtepu8_epi32(xmm_i);
1465 ymm = _mm256_cvtepi32_pd(xmm_i);
1468 static inline void Load8Val(
const unsigned char *ptr, XMMReg4Double &low,
1469 XMMReg4Double &high)
1471 const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
1472 const __m128i xmm_i_low = _mm_cvtepu8_epi32(xmm_i);
1473 low.ymm = _mm256_cvtepi32_pd(xmm_i_low);
1474 const __m128i xmm_i_high = _mm_cvtepu8_epi32(_mm_srli_si128(xmm_i, 4));
1475 high.ymm = _mm256_cvtepi32_pd(xmm_i_high);
1478 static inline XMMReg4Double Load4Val(
const short *ptr)
1481 reg.nsLoad4Val(ptr);
1485 inline void nsLoad4Val(
const short *ptr)
1487 __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
1488 xmm_i = _mm_cvtepi16_epi32(xmm_i);
1489 ymm = _mm256_cvtepi32_pd(xmm_i);
1492 static inline void Load8Val(
const short *ptr, XMMReg4Double &low,
1493 XMMReg4Double &high)
1495 low.nsLoad4Val(ptr);
1496 high.nsLoad4Val(ptr + 4);
1499 static inline XMMReg4Double Load4Val(
const unsigned short *ptr)
1502 reg.nsLoad4Val(ptr);
1506 inline void nsLoad4Val(
const unsigned short *ptr)
1508 __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
1509 xmm_i = _mm_cvtepu16_epi32(xmm_i);
1510 ymm = _mm256_cvtepi32_pd(
1515 static inline void Load8Val(
const unsigned short *ptr, XMMReg4Double &low,
1516 XMMReg4Double &high)
1518 low.nsLoad4Val(ptr);
1519 high.nsLoad4Val(ptr + 4);
1522 static inline XMMReg4Double Load4Val(
const double *ptr)
1525 reg.nsLoad4Val(ptr);
1529 inline void nsLoad4Val(
const double *ptr)
1531 ymm = _mm256_loadu_pd(ptr);
1534 static inline void Load8Val(
const double *ptr, XMMReg4Double &low,
1535 XMMReg4Double &high)
1537 low.nsLoad4Val(ptr);
1538 high.nsLoad4Val(ptr + 4);
1541 static inline XMMReg4Double Load4ValAligned(
const double *ptr)
1544 reg.nsLoad4ValAligned(ptr);
1548 inline void nsLoad4ValAligned(
const double *ptr)
1550 ymm = _mm256_load_pd(ptr);
1553 static inline XMMReg4Double Load4Val(
const float *ptr)
1556 reg.nsLoad4Val(ptr);
1560 inline void nsLoad4Val(
const float *ptr)
1562 ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
1565 static inline void Load8Val(
const float *ptr, XMMReg4Double &low,
1566 XMMReg4Double &high)
1568 low.nsLoad4Val(ptr);
1569 high.nsLoad4Val(ptr + 4);
1572 static inline XMMReg4Double Load4Val(
const int *ptr)
1575 reg.nsLoad4Val(ptr);
1579 inline void nsLoad4Val(
const int *ptr)
1581 ymm = _mm256_cvtepi32_pd(
1582 _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(ptr)));
1585 static inline void Load8Val(
const int *ptr, XMMReg4Double &low,
1586 XMMReg4Double &high)
1588 low.nsLoad4Val(ptr);
1589 high.nsLoad4Val(ptr + 4);
1592 static inline XMMReg4Double Equals(
const XMMReg4Double &expr1,
1593 const XMMReg4Double &expr2)
1596 reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
1600 static inline XMMReg4Double NotEquals(
const XMMReg4Double &expr1,
1601 const XMMReg4Double &expr2)
1604 reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
1608 static inline XMMReg4Double Greater(
const XMMReg4Double &expr1,
1609 const XMMReg4Double &expr2)
1612 reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
1616 static inline XMMReg4Double And(
const XMMReg4Double &expr1,
1617 const XMMReg4Double &expr2)
1620 reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
1624 static inline XMMReg4Double Ternary(
const XMMReg4Double &cond,
1625 const XMMReg4Double &true_expr,
1626 const XMMReg4Double &false_expr)
1629 reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
1630 _mm256_andnot_pd(cond.ymm, false_expr.ymm));
1634 static inline XMMReg4Double Min(
const XMMReg4Double &expr1,
1635 const XMMReg4Double &expr2)
1638 reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
1642 inline XMMReg4Double &operator=(
const XMMReg4Double &other)
1648 inline XMMReg4Double &operator+=(
const XMMReg4Double &other)
1650 ymm = _mm256_add_pd(ymm, other.ymm);
1654 inline XMMReg4Double &operator*=(
const XMMReg4Double &other)
1656 ymm = _mm256_mul_pd(ymm, other.ymm);
1660 inline XMMReg4Double operator+(
const XMMReg4Double &other)
const
1663 ret.ymm = _mm256_add_pd(ymm, other.ymm);
1667 inline XMMReg4Double operator-(
const XMMReg4Double &other)
const
1670 ret.ymm = _mm256_sub_pd(ymm, other.ymm);
1674 inline XMMReg4Double operator*(
const XMMReg4Double &other)
const
1677 ret.ymm = _mm256_mul_pd(ymm, other.ymm);
1681 inline XMMReg4Double operator/(
const XMMReg4Double &other)
const
1684 ret.ymm = _mm256_div_pd(ymm, other.ymm);
1688 void AddToLow(
const XMMReg2Double &other)
1690 __m256d ymm2 = _mm256_setzero_pd();
1691 ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
1692 ymm = _mm256_add_pd(ymm, ymm2);
1695 inline double GetHorizSum()
const
1697 __m256d ymm_tmp1, ymm_tmp2;
1698 ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
1699 ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
1700 ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
1701 return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
1704 inline XMMReg4Double approx_inv_sqrt(
const XMMReg4Double &one,
1705 const XMMReg4Double &half)
const
1708 __m256d reg_half = _mm256_mul_pd(reg, half.ymm);
1710 reg = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(reg)));
1714 const __m256d one_and_a_half = _mm256_add_pd(one.ymm, half.ymm);
1715 reg = _mm256_mul_pd(
1717 _mm256_sub_pd(one_and_a_half,
1718 _mm256_mul_pd(reg_half, _mm256_mul_pd(reg, reg))));
1724 inline XMMReg4Float cast_to_float()
const
1727 ret.xmm = _mm256_cvtpd_ps(ymm);
1731 inline void Store4Val(
unsigned char *ptr)
const
1734 _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
1738 _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
1740 GDALCopyXMMToInt32(xmm_i,
reinterpret_cast<GInt32 *
>(ptr));
1743 inline void Store4Val(
unsigned short *ptr)
const
1746 _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
1747 xmm_i = _mm_packus_epi32(xmm_i, xmm_i);
1748 GDALCopyXMMToInt64(xmm_i,
reinterpret_cast<GInt64 *
>(ptr));
1751 inline void Store4Val(
float *ptr)
const
1753 _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
1756 inline void Store4Val(
double *ptr)
const
1758 _mm256_storeu_pd(ptr, ymm);
1761 inline void StoreMask(
unsigned char *ptr)
const
1763 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(ptr),
1764 _mm256_castpd_si256(ymm));
1768inline XMMReg4Double XMMReg4Float::cast_to_double()
const
1771 ret.ymm = _mm256_cvtps_pd(xmm);
1775inline XMMReg4Double XMMReg4Int::cast_to_double()
const
1778 ret.ymm = _mm256_cvtepi32_pd(xmm);
1788#if !defined(_MSC_VER)
1789 : ymm(_mm256_undefined_ps())
1794 XMMReg8Float(
const XMMReg8Float &other) : ymm(other.ymm)
1798 static inline XMMReg8Float Set1(
float f)
1801 reg.ymm = _mm256_set1_ps(f);
1805 static inline XMMReg8Float LoadAllVal(
const float *ptr)
1807 return Load8Val(ptr);
1810 static inline XMMReg8Float Load8Val(
const float *ptr)
1813 reg.nsLoad8Val(ptr);
1817 static inline XMMReg8Float Load8Val(
const int *ptr)
1820 reg.nsLoad8Val(ptr);
1824 static inline XMMReg8Float Max(
const XMMReg8Float &expr1,
1825 const XMMReg8Float &expr2)
1828 reg.ymm = _mm256_max_ps(expr1.ymm, expr2.ymm);
1832 inline void nsLoad8Val(
const float *ptr)
1834 ymm = _mm256_loadu_ps(ptr);
1837 inline void nsLoad8Val(
const int *ptr)
1839 const __m256i ymm_i =
1840 _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(ptr));
1841 ymm = _mm256_cvtepi32_ps(ymm_i);
1844 inline XMMReg8Float &operator=(
const XMMReg8Float &other)
1850 inline XMMReg8Float &operator+=(
const XMMReg8Float &other)
1852 ymm = _mm256_add_ps(ymm, other.ymm);
1856 inline XMMReg8Float &operator-=(
const XMMReg8Float &other)
1858 ymm = _mm256_sub_ps(ymm, other.ymm);
1862 inline XMMReg8Float &operator*=(
const XMMReg8Float &other)
1864 ymm = _mm256_mul_ps(ymm, other.ymm);
1868 inline XMMReg8Float operator+(
const XMMReg8Float &other)
const
1871 ret.ymm = _mm256_add_ps(ymm, other.ymm);
1875 inline XMMReg8Float operator-(
const XMMReg8Float &other)
const
1878 ret.ymm = _mm256_sub_ps(ymm, other.ymm);
1882 inline XMMReg8Float operator*(
const XMMReg8Float &other)
const
1885 ret.ymm = _mm256_mul_ps(ymm, other.ymm);
1889 inline XMMReg8Float operator/(
const XMMReg8Float &other)
const
1892 ret.ymm = _mm256_div_ps(ymm, other.ymm);
1896 inline XMMReg8Float inverse()
const
1899 ret.ymm = _mm256_div_ps(_mm256_set1_ps(1.0f), ymm);
1903 inline XMMReg8Float approx_inv_sqrt(
const XMMReg8Float &one,
1904 const XMMReg8Float &half)
const
1907 __m256 reg_half = _mm256_mul_ps(reg, half.ymm);
1909 reg = _mm256_rsqrt_ps(reg);
1913 const __m256 one_and_a_half = _mm256_add_ps(one.ymm, half.ymm);
1914 reg = _mm256_mul_ps(
1916 _mm256_sub_ps(one_and_a_half,
1917 _mm256_mul_ps(reg_half, _mm256_mul_ps(reg, reg))));
1923 inline void StoreAllVal(
float *ptr)
const
// Writes the 8 floats of ymm to ptr using an unaligned store, so ptr needs
// no particular alignment.
1928 inline void Store8Val(
float *ptr)
const
1930 _mm256_storeu_ps(ptr, ymm);
// Returns an XMMReg8Float for this register.
// NOTE(review): the body is not visible in this excerpt.
1933 XMMReg8Float cast_to_float()
const
// The code that follows is only compiled when __AVX2__ is defined
// (the matching #endif is not visible in this excerpt).
1939#if defined(__AVX2__)
// Constructor initializer used on non-MSVC compilers only: ymm starts out
// with unspecified content (_mm256_undefined_si256) rather than being zeroed.
// NOTE(review): the constructor's declarator line is not visible here.
1947#if !defined(_MSC_VER)
1948 : ymm(_mm256_undefined_si256())
// Copy constructor: duplicates the other register's ymm value.
1953 XMMReg8Int(
const XMMReg8Int &other) : ymm(other.ymm)
// Copy-assignment operator for XMMReg8Int.
// NOTE(review): the body is not visible in this excerpt.
1957 inline XMMReg8Int &operator=(
const XMMReg8Int &other)
// Returns a register whose 256 bits are all zero.
1963 static inline XMMReg8Int Zero()
1966 reg.ymm = _mm256_setzero_si256();
// Returns a register with the value i broadcast to all eight 32-bit lanes.
1970 static inline XMMReg8Int Set1(
int i)
1973 reg.ymm = _mm256_set1_epi32(i);
// Loads a full register from ptr; simply forwards to Load8Val.
1977 static inline XMMReg8Int LoadAllVal(
const int *ptr)
1979 return Load8Val(ptr);
// Builds an XMMReg8Int from the integers at ptr; the actual load is
// delegated to nsLoad8Val.
1982 static inline XMMReg8Int Load8Val(
const int *ptr)
1985 reg.nsLoad8Val(ptr);
// Non-static load: fills ymm with 256 bits read from ptr using an unaligned
// load, so ptr needs no particular alignment.
1989 inline void nsLoad8Val(
const int *ptr)
1991 ymm = _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(ptr));
// Lane-wise 32-bit equality test (_mm256_cmpeq_epi32): each result lane is
// all ones where expr1 and expr2 match and all zeros otherwise, which makes
// the result usable as a bit mask.
1994 static inline XMMReg8Int Equals(
const XMMReg8Int &expr1,
1995 const XMMReg8Int &expr2)
1998 reg.ymm = _mm256_cmpeq_epi32(expr1.ymm, expr2.ymm);
// Bitwise select: (cond & true_expr) | (~cond & false_expr).
// Bits set in cond pick true_expr, cleared bits pick false_expr.
// NOTE(review): presumably cond is a mask whose lanes are all ones or all
// zeros (e.g. the result of Equals) -- confirm against callers. The line
// that receives the result is not visible in this excerpt.
2002 static inline XMMReg8Int Ternary(
const XMMReg8Int &cond,
2003 const XMMReg8Int &true_expr,
2004 const XMMReg8Int &false_expr)
2008 _mm256_or_si256(_mm256_and_si256(cond.ymm, true_expr.ymm),
2009 _mm256_andnot_si256(cond.ymm, false_expr.ymm));
// In-place lane-wise 32-bit integer addition: ymm = ymm + other.ymm.
2013 inline XMMReg8Int &operator+=(
const XMMReg8Int &other)
2015 ymm = _mm256_add_epi32(ymm, other.ymm);
// In-place lane-wise 32-bit integer subtraction: ymm = ymm - other.ymm.
2019 inline XMMReg8Int &operator-=(
const XMMReg8Int &other)
2021 ymm = _mm256_sub_epi32(ymm, other.ymm);
// Lane-wise 32-bit integer sum of this register and other; neither operand
// is modified.
2025 inline XMMReg8Int operator+(
const XMMReg8Int &other)
const
2028 ret.ymm = _mm256_add_epi32(ymm, other.ymm);
// Lane-wise 32-bit integer difference (this - other); neither operand is
// modified.
2032 inline XMMReg8Int operator-(
const XMMReg8Int &other)
const
2035 ret.ymm = _mm256_sub_epi32(ymm, other.ymm);
// Converts the eight packed 32-bit integers to eight floats
// (_mm256_cvtepi32_ps) and stores them in an XMMReg8Float.
2039 XMMReg8Float cast_to_float()
const
2042 ret.ymm = _mm256_cvtepi32_ps(ymm);
// Storage for four doubles as two XMMReg2Double halves. The Load4Val
// overloads put elements 0-1 in `low` and elements 2-3 in `high`.
2054 XMMReg2Double low, high;
// Default constructor: both halves are default-constructed XMMReg2Double
// values (their content depends on XMMReg2Double's default constructor,
// which is not visible in this excerpt).
2056 XMMReg4Double() : low(XMMReg2Double()), high(XMMReg2Double())
// Copy constructor: duplicates both halves of the other register.
2060 XMMReg4Double(
const XMMReg4Double &other) : low(other.low), high(other.high)
// Returns a zero-valued XMMReg4Double.
// NOTE(review): the body is not visible in this excerpt; "zero" is inferred
// from the name only -- confirm.
2064 static inline XMMReg4Double Zero()
// Returns a register whose two halves are both XMMReg2Double::Set1(d).
2072 static inline XMMReg4Double Set1(
double d)
2075 reg.low = XMMReg2Double::Set1(d);
2076 reg.high = XMMReg2Double::Set1(d);
// Loads from ptr into the low half via
// XMMReg2Double::nsLoad1ValHighAndLow.
// NOTE(review): how the high half is filled is not visible in this excerpt.
2080 static inline XMMReg4Double Load1ValHighAndLow(
const double *ptr)
2083 reg.low.nsLoad1ValHighAndLow(ptr);
// Loads four unsigned char values starting at ptr. The work is delegated to
// the XMMReg2Double::Load4Val overload that fills two registers at once
// (reg.low and reg.high).
2088 static inline XMMReg4Double Load4Val(
const unsigned char *ptr)
2091 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
// Loads eight unsigned char values: elements 0-3 go to the output register
// `low`, elements 4-7 to the output register `high`.
// Here `low` and `high` are the XMMReg4Double output parameters, not the
// class members of the same name.
2095 static inline void Load8Val(
const unsigned char *ptr, XMMReg4Double &low,
2096 XMMReg4Double &high)
2098 low = Load4Val(ptr);
2099 high = Load4Val(ptr + 4);
// Loads four short values: elements 0-1 into the low half and elements 2-3
// into the high half, each via XMMReg2Double::nsLoad2Val.
2102 static inline XMMReg4Double Load4Val(
const short *ptr)
2105 reg.low.nsLoad2Val(ptr);
2106 reg.high.nsLoad2Val(ptr + 2);
// Loads eight short values: elements 0-3 go to the output register `low`,
// elements 4-7 to the output register `high`.
// Here `low` and `high` are the XMMReg4Double output parameters, not the
// class members of the same name.
2110 static inline void Load8Val(
const short *ptr, XMMReg4Double &low,
2111 XMMReg4Double &high)
2113 low = Load4Val(ptr);
2114 high = Load4Val(ptr + 4);
// Loads four unsigned short values: elements 0-1 into the low half and
// elements 2-3 into the high half, each via XMMReg2Double::nsLoad2Val.
2117 static inline XMMReg4Double Load4Val(
const unsigned short *ptr)
2120 reg.low.nsLoad2Val(ptr);
2121 reg.high.nsLoad2Val(ptr + 2);
// Loads eight unsigned short values: elements 0-3 go to the output register
// `low`, elements 4-7 to the output register `high`.
// Here `low` and `high` are the XMMReg4Double output parameters, not the
// class members of the same name.
2125 static inline void Load8Val(
const unsigned short *ptr, XMMReg4Double &low,
2126 XMMReg4Double &high)
2128 low = Load4Val(ptr);
2129 high = Load4Val(ptr + 4);
// Loads four int values: elements 0-1 into the low half and elements 2-3
// into the high half, each via XMMReg2Double::nsLoad2Val.
2132 static inline XMMReg4Double Load4Val(
const int *ptr)
2135 reg.low.nsLoad2Val(ptr);
2136 reg.high.nsLoad2Val(ptr + 2);
// Loads eight int values: elements 0-3 go to the output register `low`,
// elements 4-7 to the output register `high`.
// Here `low` and `high` are the XMMReg4Double output parameters, not the
// class members of the same name.
2140 static inline void Load8Val(
const int *ptr, XMMReg4Double &low,
2141 XMMReg4Double &high)
2143 low = Load4Val(ptr);
2144 high = Load4Val(ptr + 4);
// Loads four double values: elements 0-1 into the low half and elements 2-3
// into the high half, each via XMMReg2Double::nsLoad2Val.
2147 static inline XMMReg4Double Load4Val(
const double *ptr)
2150 reg.low.nsLoad2Val(ptr);
2151 reg.high.nsLoad2Val(ptr + 2);
// Loads eight double values: elements 0-3 go to the output register `low`,
// elements 4-7 to the output register `high`.
// Here `low` and `high` are the XMMReg4Double output parameters, not the
// class members of the same name.
2155 static inline void Load8Val(
const double *ptr, XMMReg4Double &low,
2156 XMMReg4Double &high)
2158 low = Load4Val(ptr);
2159 high = Load4Val(ptr + 4);
// Same element layout as Load4Val(const double *), but each half is loaded
// with XMMReg2Double::nsLoad2ValAligned.
// NOTE(review): presumably ptr must satisfy the alignment required by
// nsLoad2ValAligned (not visible in this excerpt) -- confirm.
2162 static inline XMMReg4Double Load4ValAligned(
const double *ptr)
2165 reg.low.nsLoad2ValAligned(ptr);
2166 reg.high.nsLoad2ValAligned(ptr + 2);
// Loads four float values starting at ptr. The work is delegated to the
// XMMReg2Double::Load4Val overload that fills two registers at once
// (reg.low and reg.high).
2170 static inline XMMReg4Double Load4Val(
const float *ptr)
2173 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
// Loads eight float values: elements 0-3 go to the output register `low`,
// elements 4-7 to the output register `high`.
// Here `low` and `high` are the XMMReg4Double output parameters, not the
// class members of the same name.
2177 static inline void Load8Val(
const float *ptr, XMMReg4Double &low,
2178 XMMReg4Double &high)
2180 low = Load4Val(ptr);
2181 high = Load4Val(ptr + 4);
// Equality comparison applied independently to the low and high halves
// through XMMReg2Double::Equals.
2184 static inline XMMReg4Double Equals(
const XMMReg4Double &expr1,
2185 const XMMReg4Double &expr2)
2188 reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
2189 reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
// Inequality comparison applied independently to the low and high halves
// through XMMReg2Double::NotEquals.
2193 static inline XMMReg4Double NotEquals(
const XMMReg4Double &expr1,
2194 const XMMReg4Double &expr2)
2197 reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
2198 reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
// Greater-than comparison (expr1 > expr2) applied independently to the low
// and high halves through XMMReg2Double::Greater.
2202 static inline XMMReg4Double Greater(
const XMMReg4Double &expr1,
2203 const XMMReg4Double &expr2)
2206 reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
2207 reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
// Applies XMMReg2Double::And independently to the low and high halves of
// expr1 and expr2.
2211 static inline XMMReg4Double And(
const XMMReg4Double &expr1,
2212 const XMMReg4Double &expr2)
2215 reg.low = XMMReg2Double::And(expr1.low, expr2.low);
2216 reg.high = XMMReg2Double::And(expr1.high, expr2.high);
// Conditional select applied independently to the low and high halves
// through XMMReg2Double::Ternary(cond, true_expr, false_expr).
// NOTE(review): the lines receiving the two results are not visible in this
// excerpt; presumably they are reg.low and reg.high.
2220 static inline XMMReg4Double Ternary(
const XMMReg4Double &cond,
2221 const XMMReg4Double &true_expr,
2222 const XMMReg4Double &false_expr)
2226 XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
2228 XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
// Minimum of expr1 and expr2, computed independently on the low and high
// halves through XMMReg2Double::Min.
2232 static inline XMMReg4Double Min(
const XMMReg4Double &expr1,
2233 const XMMReg4Double &expr2)
2236 reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
2237 reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
// Copy-assignment operator for XMMReg4Double.
// NOTE(review): the body is not visible in this excerpt.
2241 inline XMMReg4Double &operator=(
const XMMReg4Double &other)
// Compound addition with another XMMReg4Double.
// NOTE(review): the body is not visible in this excerpt.
2248 inline XMMReg4Double &operator+=(
const XMMReg4Double &other)
// Compound multiplication with another XMMReg4Double.
// NOTE(review): the body is not visible in this excerpt.
2255 inline XMMReg4Double &operator*=(
const XMMReg4Double &other)
// Sum of this register and other, computed half by half; neither operand
// is modified.
2262 inline XMMReg4Double operator+(
const XMMReg4Double &other)
const
2265 ret.low = low + other.low;
2266 ret.high = high + other.high;
// Difference (this - other), computed half by half; neither operand is
// modified.
2270 inline XMMReg4Double operator-(
const XMMReg4Double &other)
const
2273 ret.low = low - other.low;
2274 ret.high = high - other.high;
// Product of this register and other, computed half by half; neither
// operand is modified.
2278 inline XMMReg4Double operator*(
const XMMReg4Double &other)
const
2281 ret.low = low * other.low;
2282 ret.high = high * other.high;
// Quotient (this / other), computed half by half; neither operand is
// modified.
2286 inline XMMReg4Double operator/(
const XMMReg4Double &other)
const
2289 ret.low = low / other.low;
2290 ret.high = high / other.high;
// Takes a two-double register as input.
// NOTE(review): the body is not visible in this excerpt; presumably it adds
// `other` to the low half only -- confirm.
2294 void AddToLow(
const XMMReg2Double &other)
// Adds the low and high halves together, then returns
// XMMReg2Double::GetHorizSum() of that result.
// NOTE(review): presumably XMMReg2Double::GetHorizSum adds its two lanes,
// making this the sum of all four doubles -- confirm.
2299 inline double GetHorizSum()
const
2301 return (low + high).GetHorizSum();
// Only compiled when USE_SSE2_EMULATION is not defined, since the code
// below uses SSE intrinsics directly.
// Approximate inverse square root of the four doubles.
// Each half is narrowed to float, estimated with _mm_rsqrt_ps, widened back
// to double, then combined with (one + half) - regN_half * estimate^2.
// Only the .low halves of `one` and `half` are read, for both halves of
// this register.
// NOTE(review): presumably `one` holds 1.0 and `half` holds 0.5 in every
// lane, which would make this a single Newton-Raphson refinement step;
// confirm against callers. The lines opening the two refinement expressions
// and the assignment of ret.low are not visible in this excerpt.
2304#if !defined(USE_SSE2_EMULATION)
2305 inline XMMReg4Double approx_inv_sqrt(
const XMMReg4Double &one,
2306 const XMMReg4Double &half)
const
2308 __m128d reg0 = low.xmm;
2309 __m128d reg1 = high.xmm;
// Computed before reg0/reg1 are overwritten by their rsqrt estimates.
2310 __m128d reg0_half = _mm_mul_pd(reg0, half.low.xmm);
2311 __m128d reg1_half = _mm_mul_pd(reg1, half.low.xmm);
// The estimate is taken in single precision (double -> float -> rsqrt ->
// double).
2313 reg0 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg0)));
2314 reg1 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg1)));
2318 const __m128d one_and_a_half = _mm_add_pd(one.low.xmm, half.low.xmm);
2320 reg0, _mm_sub_pd(one_and_a_half,
2321 _mm_mul_pd(reg0_half, _mm_mul_pd(reg0, reg0))));
2323 reg1, _mm_sub_pd(one_and_a_half,
2324 _mm_mul_pd(reg1_half, _mm_mul_pd(reg1, reg1))));
2327 ret.high.xmm = reg1;
// Narrows the four doubles to four floats.
// _mm_cvtpd_ps leaves its two converted floats in the low 64 bits of its
// result; _mm_unpacklo_epi64 then joins the low 64 bits of the `low` and
// `high` conversions, giving the four floats in order.
2331 inline XMMReg4Float cast_to_float()
const
2334 ret.xmm = _mm_castsi128_ps(
2335 _mm_unpacklo_epi64(_mm_castps_si128(_mm_cvtpd_ps(low.xmm)),
2336 _mm_castps_si128(_mm_cvtpd_ps(high.xmm))));
// Stores the four doubles as four unsigned char values at ptr.
// Two implementations are selected by USE_SSE2_EMULATION: one goes through
// the halves' Store2Val, the other uses SSE2 intrinsics.
// NOTE(review): several lines are not visible in this excerpt -- the
// low-half store of the emulation path, the #else/#endif, and the operands
// of the two _mm_add_pd calls (presumably each half plus 0.5 so that the
// truncating conversion rounds to nearest -- confirm).
2341 inline void Store4Val(
unsigned char *ptr)
const
2343#ifdef USE_SSE2_EMULATION
2345 high.Store2Val(ptr + 2);
2347 __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
2350 __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
// _MM_SHUFFLE(1, 0, 1, 0) gathers the two low 32-bit lanes of tmpLow and
// the two low 32-bit lanes of tmpHigh into a single register.
2353 auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
2354 _mm_castsi128_ps(tmpHigh),
2355 _MM_SHUFFLE(1, 0, 1, 0)));
// Narrow int32 -> int16 with signed saturation, then int16 -> uint8 with
// unsigned saturation, so out-of-range values are clamped rather than
// wrapped.
2356 tmp = _mm_packs_epi32(tmp, tmp);
2357 tmp = _mm_packus_epi16(tmp, tmp);
// Presumably writes the low 32 bits of tmp (the four result bytes) to ptr;
// GDALCopyXMMToInt32 is defined elsewhere -- confirm.
2358 GDALCopyXMMToInt32(tmp,
reinterpret_cast<GInt32 *
>(ptr));
// Stores the four doubles as four unsigned short values at ptr.
// NOTE(review): the preprocessor lines of this function are not visible in
// this excerpt. Presumably the Store2Val call belongs to a
// USE_SSE2_EMULATION path, _mm_packus_epi32 to an SSE4.1 path, and the
// add / packs / sub sequence to the plain SSE2 fallback -- confirm against
// the full file.
2362 inline void Store4Val(
unsigned short *ptr)
const
2366 high.Store2Val(ptr + 2);
// _mm_cvtpd_epi32 converts using the current rounding mode and puts its two
// int32 results in the low 64 bits.
2368 __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
2369 __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
// Shift the high pair up by 8 bytes and merge, giving all four int32 values
// in one register.
2370 xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
// int32 -> uint16 with unsigned saturation.
2372 xmm0 = _mm_packus_epi32(xmm0, xmm0);
// Bias by -32768 so the signed-saturating pack can be used, then remove the
// bias again in 16-bit arithmetic.
2374 xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
2375 xmm0 = _mm_packs_epi32(xmm0, xmm0);
2376 xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
// Presumably writes the low 64 bits of xmm0 (four 16-bit results) to ptr;
// GDALCopyXMMToInt64 is defined elsewhere -- confirm.
2378 GDALCopyXMMToInt64(xmm0, (
GInt64 *)ptr);
// Stores the four values as floats at ptr; the high half is written to
// ptr + 2 through XMMReg2Double::Store2Val.
// NOTE(review): the statement storing the low half is not visible in this
// excerpt.
2382 inline void Store4Val(
float *ptr)
const
2385 high.Store2Val(ptr + 2);
// Stores the four doubles at ptr; the high half is written to ptr + 2
// through XMMReg2Double::Store2Val.
// NOTE(review): the statement storing the low half is not visible in this
// excerpt.
2388 inline void Store4Val(
double *ptr)
const
2391 high.Store2Val(ptr + 2);
// Writes the register content as a mask to ptr; the high half goes to
// ptr + 16 through XMMReg2Double::StoreMask.
// NOTE(review): the statement storing the low half is not visible in this
// excerpt; presumably it writes the first 16 bytes at ptr -- confirm.
2394 inline void StoreMask(
unsigned char *ptr)
const
2397 high.StoreMask(ptr + 16);
// Only compiled when USE_SSE2_EMULATION is not defined.
// Widens the four floats to four doubles.
// _mm_cvtps_pd converts the two lowest floats, so the low half comes
// straight from xmm. For the high half the register is first shifted right
// by 8 bytes to bring the two upper floats into the low positions.
2401#if !defined(USE_SSE2_EMULATION)
2402inline XMMReg4Double XMMReg4Float::cast_to_double()
const
2405 ret.low.xmm = _mm_cvtps_pd(xmm);
2406 ret.high.xmm = _mm_cvtps_pd(
2407 _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(xmm), 8)));
// Converts the four 32-bit integers to four doubles.
// _mm_cvtepi32_pd converts the two lowest integers, so the low half comes
// straight from xmm. For the high half the register is first shifted right
// by 8 bytes to bring the two upper integers into the low positions.
2411inline XMMReg4Double XMMReg4Int::cast_to_double()
const
2414 ret.low.xmm = _mm_cvtepi32_pd(xmm);
2415 ret.high.xmm = _mm_cvtepi32_pd(_mm_srli_si128(xmm, 8));
Core portability definitions for CPL.
short GInt16
Int16 type.
Definition cpl_port.h:161
GIntBig GInt64
Signed 64 bit integer type.
Definition cpl_port.h:216
unsigned short GUInt16
Unsigned int16 type.
Definition cpl_port.h:163
int GInt32
Int32 type.
Definition cpl_port.h:155