GDAL
gdalsse_priv.h
1/******************************************************************************
2 *
3 * Project: GDAL
4 * Purpose: SSE2 helper
5 * Author: Even Rouault <even dot rouault at spatialys dot com>
6 *
7 ******************************************************************************
8 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
9 *
10 * SPDX-License-Identifier: MIT
11 ****************************************************************************/
12
13#ifndef GDALSSE_PRIV_H_INCLUDED
14#define GDALSSE_PRIV_H_INCLUDED
15
16#ifndef DOXYGEN_SKIP
17
18#include "cpl_port.h"
19
20/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
21/* Could possibly be used too on 32bit, but we would need to check at runtime */
22#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
23 !defined(USE_SSE2_EMULATION)
24
25#include <string.h>
26
27#ifdef USE_NEON_OPTIMIZATIONS
28#include "include_sse2neon.h"
29#else
30/* Requires SSE2 */
31#include <emmintrin.h>
32
33#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
34#include <smmintrin.h>
35#endif
36#endif
37
38#include "gdal_priv_templates.hpp"
39
40static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
41{
42 unsigned short s;
43 memcpy(&s, ptr, 2);
44 return _mm_cvtsi32_si128(s);
45}
46
47static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
48{
49 GInt32 i;
50 memcpy(&i, ptr, 4);
51 return _mm_cvtsi32_si128(i);
52}
53
54static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
55{
56#if defined(__i386__) || defined(_M_IX86)
57 return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
58#else
59 GInt64 i;
60 memcpy(&i, ptr, 8);
61 return _mm_cvtsi64_si128(i);
62#endif
63}
64
65#ifndef GDALCopyXMMToInt16_defined
66#define GDALCopyXMMToInt16_defined
67
68static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
69{
70 GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
71 memcpy(pDest, &i, 2);
72}
73#endif
74
75class XMMReg4Int;
76
77class XMMReg4Double;
78
79class XMMReg4Float
80{
81 public:
82 __m128 xmm;
83
84 XMMReg4Float()
85#if !defined(_MSC_VER)
86 : xmm(_mm_undefined_ps())
87#endif
88 {
89 }
90
91 XMMReg4Float(const XMMReg4Float &other) : xmm(other.xmm)
92 {
93 }
94
95 static inline XMMReg4Float Zero()
96 {
97 XMMReg4Float reg;
98 reg.Zeroize();
99 return reg;
100 }
101
102 static inline XMMReg4Float Set1(float f)
103 {
104 XMMReg4Float reg;
105 reg.xmm = _mm_set1_ps(f);
106 return reg;
107 }
108
109 static inline XMMReg4Float LoadAllVal(const float *ptr)
110 {
111 return Load4Val(ptr);
112 }
113
114 static inline XMMReg4Float Load4Val(const float *ptr)
115 {
116 XMMReg4Float reg;
117 reg.nsLoad4Val(ptr);
118 return reg;
119 }
120
121 static inline XMMReg4Float Load4Val(const unsigned char *ptr)
122 {
123 XMMReg4Float reg;
124 reg.nsLoad4Val(ptr);
125 return reg;
126 }
127
128 static inline XMMReg4Float Load4Val(const short *ptr)
129 {
130 XMMReg4Float reg;
131 reg.nsLoad4Val(ptr);
132 return reg;
133 }
134
135 static inline XMMReg4Float Load4Val(const unsigned short *ptr)
136 {
137 XMMReg4Float reg;
138 reg.nsLoad4Val(ptr);
139 return reg;
140 }
141
142 static inline XMMReg4Float Load4Val(const int *ptr)
143 {
144 XMMReg4Float reg;
145 reg.nsLoad4Val(ptr);
146 return reg;
147 }
148
149 static inline XMMReg4Float Equals(const XMMReg4Float &expr1,
150 const XMMReg4Float &expr2)
151 {
152 XMMReg4Float reg;
153 reg.xmm = _mm_cmpeq_ps(expr1.xmm, expr2.xmm);
154 return reg;
155 }
156
157 static inline XMMReg4Float NotEquals(const XMMReg4Float &expr1,
158 const XMMReg4Float &expr2)
159 {
160 XMMReg4Float reg;
161 reg.xmm = _mm_cmpneq_ps(expr1.xmm, expr2.xmm);
162 return reg;
163 }
164
165 static inline XMMReg4Float Lesser(const XMMReg4Float &expr1,
166 const XMMReg4Float &expr2)
167 {
168 XMMReg4Float reg;
169 reg.xmm = _mm_cmplt_ps(expr1.xmm, expr2.xmm);
170 return reg;
171 }
172
173 static inline XMMReg4Float Greater(const XMMReg4Float &expr1,
174 const XMMReg4Float &expr2)
175 {
176 XMMReg4Float reg;
177 reg.xmm = _mm_cmpgt_ps(expr1.xmm, expr2.xmm);
178 return reg;
179 }
180
181 static inline XMMReg4Float And(const XMMReg4Float &expr1,
182 const XMMReg4Float &expr2)
183 {
184 XMMReg4Float reg;
185 reg.xmm = _mm_and_ps(expr1.xmm, expr2.xmm);
186 return reg;
187 }
188
189 static inline XMMReg4Float Ternary(const XMMReg4Float &cond,
190 const XMMReg4Float &true_expr,
191 const XMMReg4Float &false_expr)
192 {
193 XMMReg4Float reg;
194 reg.xmm = _mm_or_ps(_mm_and_ps(cond.xmm, true_expr.xmm),
195 _mm_andnot_ps(cond.xmm, false_expr.xmm));
196 return reg;
197 }
198
199 static inline XMMReg4Float Min(const XMMReg4Float &expr1,
200 const XMMReg4Float &expr2)
201 {
202 XMMReg4Float reg;
203 reg.xmm = _mm_min_ps(expr1.xmm, expr2.xmm);
204 return reg;
205 }
206
207 static inline XMMReg4Float Max(const XMMReg4Float &expr1,
208 const XMMReg4Float &expr2)
209 {
210 XMMReg4Float reg;
211 reg.xmm = _mm_max_ps(expr1.xmm, expr2.xmm);
212 return reg;
213 }
214
215 inline void nsLoad4Val(const float *ptr)
216 {
217 xmm = _mm_loadu_ps(ptr);
218 }
219
220 static inline void Load16Val(const float *ptr, XMMReg4Float &r0,
221 XMMReg4Float &r1, XMMReg4Float &r2,
222 XMMReg4Float &r3)
223 {
224 r0.nsLoad4Val(ptr);
225 r1.nsLoad4Val(ptr + 4);
226 r2.nsLoad4Val(ptr + 8);
227 r3.nsLoad4Val(ptr + 12);
228 }
229
230 inline void nsLoad4Val(const int *ptr)
231 {
232 const __m128i xmm_i =
233 _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
234 xmm = _mm_cvtepi32_ps(xmm_i);
235 }
236
237 static inline void Load16Val(const int *ptr, XMMReg4Float &r0,
238 XMMReg4Float &r1, XMMReg4Float &r2,
239 XMMReg4Float &r3)
240 {
241 r0.nsLoad4Val(ptr);
242 r1.nsLoad4Val(ptr + 4);
243 r2.nsLoad4Val(ptr + 8);
244 r3.nsLoad4Val(ptr + 12);
245 }
246
247 static inline __m128i cvtepu8_epi32(__m128i x)
248 {
249#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
250 return _mm_cvtepu8_epi32(x);
251#else
252 return _mm_unpacklo_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()),
253 _mm_setzero_si128());
254#endif
255 }
256
257 inline void nsLoad4Val(const unsigned char *ptr)
258 {
259 const __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
260 xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
261 }
262
263 static inline void Load8Val(const unsigned char *ptr, XMMReg4Float &r0,
264 XMMReg4Float &r1)
265 {
266 const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
267 r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
268 r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
269 }
270
271 static inline void Load16Val(const unsigned char *ptr, XMMReg4Float &r0,
272 XMMReg4Float &r1, XMMReg4Float &r2,
273 XMMReg4Float &r3)
274 {
275 const __m128i xmm_i =
276 _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
277 r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
278 r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
279 r2.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 8)));
280 r3.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 12)));
281 }
282
283 static inline __m128i cvtepi16_epi32(__m128i x)
284 {
285#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
286 return _mm_cvtepi16_epi32(x);
287#else
288 /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
289 return _mm_srai_epi32(
290 /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
291 _mm_unpacklo_epi16(x, x), 16);
292#endif
293 }
294
295 inline void nsLoad4Val(const short *ptr)
296 {
297 const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
298 xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
299 }
300
301 static inline void Load8Val(const short *ptr, XMMReg4Float &r0,
302 XMMReg4Float &r1)
303 {
304 const __m128i xmm_i =
305 _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
306 r0.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
307 r1.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(_mm_srli_si128(xmm_i, 8)));
308 }
309
310 static inline void Load16Val(const short *ptr, XMMReg4Float &r0,
311 XMMReg4Float &r1, XMMReg4Float &r2,
312 XMMReg4Float &r3)
313 {
314 Load8Val(ptr, r0, r1);
315 Load8Val(ptr + 8, r2, r3);
316 }
317
318 static inline __m128i cvtepu16_epi32(__m128i x)
319 {
320#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
321 return _mm_cvtepu16_epi32(x);
322#else
323 return _mm_unpacklo_epi16(
324 x, _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
325#endif
326 }
327
328 inline void nsLoad4Val(const unsigned short *ptr)
329 {
330 const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
331 xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
332 }
333
334 static inline void Load8Val(const unsigned short *ptr, XMMReg4Float &r0,
335 XMMReg4Float &r1)
336 {
337 const __m128i xmm_i =
338 _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
339 r0.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
340 r1.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(_mm_srli_si128(xmm_i, 8)));
341 }
342
343 static inline void Load16Val(const unsigned short *ptr, XMMReg4Float &r0,
344 XMMReg4Float &r1, XMMReg4Float &r2,
345 XMMReg4Float &r3)
346 {
347 Load8Val(ptr, r0, r1);
348 Load8Val(ptr + 8, r2, r3);
349 }
350
351 inline void Zeroize()
352 {
353 xmm = _mm_setzero_ps();
354 }
355
356 inline XMMReg4Float &operator=(const XMMReg4Float &other)
357 {
358 xmm = other.xmm;
359 return *this;
360 }
361
362 inline XMMReg4Float &operator+=(const XMMReg4Float &other)
363 {
364 xmm = _mm_add_ps(xmm, other.xmm);
365 return *this;
366 }
367
368 inline XMMReg4Float &operator-=(const XMMReg4Float &other)
369 {
370 xmm = _mm_sub_ps(xmm, other.xmm);
371 return *this;
372 }
373
374 inline XMMReg4Float &operator*=(const XMMReg4Float &other)
375 {
376 xmm = _mm_mul_ps(xmm, other.xmm);
377 return *this;
378 }
379
380 inline XMMReg4Float operator+(const XMMReg4Float &other) const
381 {
382 XMMReg4Float ret;
383 ret.xmm = _mm_add_ps(xmm, other.xmm);
384 return ret;
385 }
386
387 inline XMMReg4Float operator-(const XMMReg4Float &other) const
388 {
389 XMMReg4Float ret;
390 ret.xmm = _mm_sub_ps(xmm, other.xmm);
391 return ret;
392 }
393
394 inline XMMReg4Float operator*(const XMMReg4Float &other) const
395 {
396 XMMReg4Float ret;
397 ret.xmm = _mm_mul_ps(xmm, other.xmm);
398 return ret;
399 }
400
401 inline XMMReg4Float operator/(const XMMReg4Float &other) const
402 {
403 XMMReg4Float ret;
404 ret.xmm = _mm_div_ps(xmm, other.xmm);
405 return ret;
406 }
407
408 inline XMMReg4Float inverse() const
409 {
410 XMMReg4Float ret;
411 ret.xmm = _mm_div_ps(_mm_set1_ps(1.0f), xmm);
412 return ret;
413 }
414
415 inline XMMReg4Int truncate_to_int() const;
416
417 inline XMMReg4Float cast_to_float() const
418 {
419 return *this;
420 }
421
422 inline XMMReg4Double cast_to_double() const;
423
424 inline XMMReg4Float approx_inv_sqrt(const XMMReg4Float &one,
425 const XMMReg4Float &half) const
426 {
427 __m128 reg = xmm;
428 __m128 reg_half = _mm_mul_ps(reg, half.xmm);
429 // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
430 reg = _mm_rsqrt_ps(reg);
431 // And perform one step of Newton-Raphson approximation to improve it
432 // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
433 // 0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
434 const __m128 one_and_a_half = _mm_add_ps(one.xmm, half.xmm);
435 reg = _mm_mul_ps(
436 reg, _mm_sub_ps(one_and_a_half,
437 _mm_mul_ps(reg_half, _mm_mul_ps(reg, reg))));
438 XMMReg4Float ret;
439 ret.xmm = reg;
440 return ret;
441 }
442
443 inline void StoreAllVal(float *ptr) const
444 {
445 Store4Val(ptr);
446 }
447
448 inline void Store4Val(float *ptr) const
449 {
450 _mm_storeu_ps(ptr, xmm);
451 }
452
453 inline void Store4ValAligned(float *ptr) const
454 {
455 _mm_store_ps(ptr, xmm);
456 }
457
458 inline operator float() const
459 {
460 return _mm_cvtss_f32(xmm);
461 }
462};
463
464class XMMReg4Int
465{
466 public:
467 __m128i xmm;
468
469 XMMReg4Int()
470#if !defined(_MSC_VER)
471 : xmm(_mm_undefined_si128())
472#endif
473 {
474 }
475
476 XMMReg4Int(const XMMReg4Int &other) : xmm(other.xmm)
477 {
478 }
479
480 inline XMMReg4Int &operator=(const XMMReg4Int &other)
481 {
482 xmm = other.xmm;
483 return *this;
484 }
485
486 static inline XMMReg4Int Zero()
487 {
488 XMMReg4Int reg;
489 reg.xmm = _mm_setzero_si128();
490 return reg;
491 }
492
493 static inline XMMReg4Int Set1(int i)
494 {
495 XMMReg4Int reg;
496 reg.xmm = _mm_set1_epi32(i);
497 return reg;
498 }
499
500 static inline XMMReg4Int LoadAllVal(const int *ptr)
501 {
502 return Load4Val(ptr);
503 }
504
505 static inline XMMReg4Int Load4Val(const int *ptr)
506 {
507 XMMReg4Int reg;
508 reg.nsLoad4Val(ptr);
509 return reg;
510 }
511
512 inline void nsLoad4Val(const int *ptr)
513 {
514 xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
515 }
516
517 static inline XMMReg4Int Equals(const XMMReg4Int &expr1,
518 const XMMReg4Int &expr2)
519 {
520 XMMReg4Int reg;
521 reg.xmm = _mm_cmpeq_epi32(expr1.xmm, expr2.xmm);
522 return reg;
523 }
524
525 static inline XMMReg4Int Ternary(const XMMReg4Int &cond,
526 const XMMReg4Int &true_expr,
527 const XMMReg4Int &false_expr)
528 {
529 XMMReg4Int reg;
530 reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
531 _mm_andnot_si128(cond.xmm, false_expr.xmm));
532 return reg;
533 }
534
535 inline XMMReg4Int &operator+=(const XMMReg4Int &other)
536 {
537 xmm = _mm_add_epi32(xmm, other.xmm);
538 return *this;
539 }
540
541 inline XMMReg4Int &operator-=(const XMMReg4Int &other)
542 {
543 xmm = _mm_sub_epi32(xmm, other.xmm);
544 return *this;
545 }
546
547 inline XMMReg4Int operator+(const XMMReg4Int &other) const
548 {
549 XMMReg4Int ret;
550 ret.xmm = _mm_add_epi32(xmm, other.xmm);
551 return ret;
552 }
553
554 inline XMMReg4Int operator-(const XMMReg4Int &other) const
555 {
556 XMMReg4Int ret;
557 ret.xmm = _mm_sub_epi32(xmm, other.xmm);
558 return ret;
559 }
560
561 XMMReg4Double cast_to_double() const;
562
563 XMMReg4Float cast_to_float() const
564 {
565 XMMReg4Float ret;
566 ret.xmm = _mm_cvtepi32_ps(xmm);
567 return ret;
568 }
569};
570
571inline XMMReg4Int XMMReg4Float::truncate_to_int() const
572{
573 XMMReg4Int ret;
574 ret.xmm = _mm_cvttps_epi32(xmm);
575 return ret;
576}
577
578class XMMReg8Byte
579{
580 public:
581 __m128i xmm;
582
583 XMMReg8Byte()
584#if !defined(_MSC_VER)
585 : xmm(_mm_undefined_si128())
586#endif
587 {
588 }
589
590 XMMReg8Byte(const XMMReg8Byte &other) : xmm(other.xmm)
591 {
592 }
593
594 static inline XMMReg8Byte Zero()
595 {
596 XMMReg8Byte reg;
597 reg.xmm = _mm_setzero_si128();
598 return reg;
599 }
600
601 static inline XMMReg8Byte Set1(char i)
602 {
603 XMMReg8Byte reg;
604 reg.xmm = _mm_set1_epi8(i);
605 return reg;
606 }
607
608 static inline XMMReg8Byte Equals(const XMMReg8Byte &expr1,
609 const XMMReg8Byte &expr2)
610 {
611 XMMReg8Byte reg;
612 reg.xmm = _mm_cmpeq_epi8(expr1.xmm, expr2.xmm);
613 return reg;
614 }
615
616 static inline XMMReg8Byte Or(const XMMReg8Byte &expr1,
617 const XMMReg8Byte &expr2)
618 {
619 XMMReg8Byte reg;
620 reg.xmm = _mm_or_si128(expr1.xmm, expr2.xmm);
621 return reg;
622 }
623
624 static inline XMMReg8Byte Ternary(const XMMReg8Byte &cond,
625 const XMMReg8Byte &true_expr,
626 const XMMReg8Byte &false_expr)
627 {
628 XMMReg8Byte reg;
629 reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
630 _mm_andnot_si128(cond.xmm, false_expr.xmm));
631 return reg;
632 }
633
634 inline XMMReg8Byte operator+(const XMMReg8Byte &other) const
635 {
636 XMMReg8Byte ret;
637 ret.xmm = _mm_add_epi8(xmm, other.xmm);
638 return ret;
639 }
640
641 inline XMMReg8Byte operator-(const XMMReg8Byte &other) const
642 {
643 XMMReg8Byte ret;
644 ret.xmm = _mm_sub_epi8(xmm, other.xmm);
645 return ret;
646 }
647
648 static inline XMMReg8Byte Pack(const XMMReg4Int &r0, const XMMReg4Int &r1)
649 {
650 XMMReg8Byte reg;
651 reg.xmm = _mm_packs_epi32(r0.xmm, r1.xmm);
652 reg.xmm = _mm_packus_epi16(reg.xmm, reg.xmm);
653 return reg;
654 }
655
656 inline void Store8Val(unsigned char *ptr) const
657 {
658 GDALCopyXMMToInt64(xmm, reinterpret_cast<GInt64 *>(ptr));
659 }
660};
661
662class XMMReg2Double
663{
664 public:
665 __m128d xmm;
666
667 XMMReg2Double()
668#if !defined(_MSC_VER)
669 : xmm(_mm_undefined_pd())
670#endif
671 {
672 }
673
674 XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
675 {
676 }
677
678 XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
679 {
680 }
681
682 static inline XMMReg2Double Set1(double d)
683 {
684 XMMReg2Double reg;
685 reg.xmm = _mm_set1_pd(d);
686 return reg;
687 }
688
689 static inline XMMReg2Double Zero()
690 {
691 XMMReg2Double reg;
692 reg.Zeroize();
693 return reg;
694 }
695
696 static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
697 {
698 XMMReg2Double reg;
699 reg.nsLoad1ValHighAndLow(ptr);
700 return reg;
701 }
702
703 static inline XMMReg2Double Load2Val(const double *ptr)
704 {
705 XMMReg2Double reg;
706 reg.nsLoad2Val(ptr);
707 return reg;
708 }
709
710 static inline XMMReg2Double Load2Val(const float *ptr)
711 {
712 XMMReg2Double reg;
713 reg.nsLoad2Val(ptr);
714 return reg;
715 }
716
717 static inline XMMReg2Double Load2ValAligned(const double *ptr)
718 {
719 XMMReg2Double reg;
720 reg.nsLoad2ValAligned(ptr);
721 return reg;
722 }
723
724 static inline XMMReg2Double Load2Val(const unsigned char *ptr)
725 {
726 XMMReg2Double reg;
727 reg.nsLoad2Val(ptr);
728 return reg;
729 }
730
731 static inline XMMReg2Double Load2Val(const short *ptr)
732 {
733 XMMReg2Double reg;
734 reg.nsLoad2Val(ptr);
735 return reg;
736 }
737
738 static inline XMMReg2Double Load2Val(const unsigned short *ptr)
739 {
740 XMMReg2Double reg;
741 reg.nsLoad2Val(ptr);
742 return reg;
743 }
744
745 static inline XMMReg2Double Load2Val(const int *ptr)
746 {
747 XMMReg2Double reg;
748 reg.nsLoad2Val(ptr);
749 return reg;
750 }
751
752 static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
753 const XMMReg2Double &expr2)
754 {
755 XMMReg2Double reg;
756 reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
757 return reg;
758 }
759
760 static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
761 const XMMReg2Double &expr2)
762 {
763 XMMReg2Double reg;
764 reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
765 return reg;
766 }
767
768 static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
769 const XMMReg2Double &expr2)
770 {
771 XMMReg2Double reg;
772 reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
773 return reg;
774 }
775
776 static inline XMMReg2Double And(const XMMReg2Double &expr1,
777 const XMMReg2Double &expr2)
778 {
779 XMMReg2Double reg;
780 reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
781 return reg;
782 }
783
784 static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
785 const XMMReg2Double &true_expr,
786 const XMMReg2Double &false_expr)
787 {
788 XMMReg2Double reg;
789 reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
790 _mm_andnot_pd(cond.xmm, false_expr.xmm));
791 return reg;
792 }
793
794 static inline XMMReg2Double Min(const XMMReg2Double &expr1,
795 const XMMReg2Double &expr2)
796 {
797 XMMReg2Double reg;
798 reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
799 return reg;
800 }
801
802 inline void nsLoad1ValHighAndLow(const double *ptr)
803 {
804 xmm = _mm_load1_pd(ptr);
805 }
806
807 inline void nsLoad2Val(const double *ptr)
808 {
809 xmm = _mm_loadu_pd(ptr);
810 }
811
812 inline void nsLoad2ValAligned(const double *ptr)
813 {
814 xmm = _mm_load_pd(ptr);
815 }
816
817 inline void nsLoad2Val(const float *ptr)
818 {
819 xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
820 }
821
822 inline void nsLoad2Val(const int *ptr)
823 {
824 xmm = _mm_cvtepi32_pd(GDALCopyInt64ToXMM(ptr));
825 }
826
827 inline void nsLoad2Val(const unsigned char *ptr)
828 {
829 __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
830#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
831 xmm_i = _mm_cvtepu8_epi32(xmm_i);
832#else
833 xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
834 xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
835#endif
836 xmm = _mm_cvtepi32_pd(xmm_i);
837 }
838
839 inline void nsLoad2Val(const short *ptr)
840 {
841 __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
842#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
843 xmm_i = _mm_cvtepi16_epi32(xmm_i);
844#else
845 xmm_i = _mm_unpacklo_epi16(
846 xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
847 xmm_i = _mm_srai_epi32(
848 xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
849#endif
850 xmm = _mm_cvtepi32_pd(xmm_i);
851 }
852
853 inline void nsLoad2Val(const unsigned short *ptr)
854 {
855 __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
856#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
857 xmm_i = _mm_cvtepu16_epi32(xmm_i);
858#else
859 xmm_i = _mm_unpacklo_epi16(
860 xmm_i,
861 _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
862#endif
863 xmm = _mm_cvtepi32_pd(xmm_i);
864 }
865
866 static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
867 XMMReg2Double &high)
868 {
869 __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
870#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
871 xmm_i = _mm_cvtepu8_epi32(xmm_i);
872#else
873 xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
874 xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
875#endif
876 low.xmm = _mm_cvtepi32_pd(xmm_i);
877 high.xmm =
878 _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
879 }
880
881 static inline void Load4Val(const short *ptr, XMMReg2Double &low,
882 XMMReg2Double &high)
883 {
884 low.nsLoad2Val(ptr);
885 high.nsLoad2Val(ptr + 2);
886 }
887
888 static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
889 XMMReg2Double &high)
890 {
891 low.nsLoad2Val(ptr);
892 high.nsLoad2Val(ptr + 2);
893 }
894
895 static inline void Load4Val(const double *ptr, XMMReg2Double &low,
896 XMMReg2Double &high)
897 {
898 low.nsLoad2Val(ptr);
899 high.nsLoad2Val(ptr + 2);
900 }
901
902 static inline void Load4Val(const float *ptr, XMMReg2Double &low,
903 XMMReg2Double &high)
904 {
905 __m128 temp1 = _mm_loadu_ps(ptr);
906 __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
907 low.xmm = _mm_cvtps_pd(temp1);
908 high.xmm = _mm_cvtps_pd(temp2);
909 }
910
911 inline void Zeroize()
912 {
913 xmm = _mm_setzero_pd();
914 }
915
916 inline XMMReg2Double &operator=(const XMMReg2Double &other)
917 {
918 xmm = other.xmm;
919 return *this;
920 }
921
922 inline XMMReg2Double &operator+=(const XMMReg2Double &other)
923 {
924 xmm = _mm_add_pd(xmm, other.xmm);
925 return *this;
926 }
927
928 inline XMMReg2Double &operator*=(const XMMReg2Double &other)
929 {
930 xmm = _mm_mul_pd(xmm, other.xmm);
931 return *this;
932 }
933
934 inline XMMReg2Double operator+(const XMMReg2Double &other) const
935 {
936 XMMReg2Double ret;
937 ret.xmm = _mm_add_pd(xmm, other.xmm);
938 return ret;
939 }
940
941 inline XMMReg2Double operator-(const XMMReg2Double &other) const
942 {
943 XMMReg2Double ret;
944 ret.xmm = _mm_sub_pd(xmm, other.xmm);
945 return ret;
946 }
947
948 inline XMMReg2Double operator*(const XMMReg2Double &other) const
949 {
950 XMMReg2Double ret;
951 ret.xmm = _mm_mul_pd(xmm, other.xmm);
952 return ret;
953 }
954
955 inline XMMReg2Double operator/(const XMMReg2Double &other) const
956 {
957 XMMReg2Double ret;
958 ret.xmm = _mm_div_pd(xmm, other.xmm);
959 return ret;
960 }
961
962 inline double GetHorizSum() const
963 {
964 __m128d xmm2;
965 xmm2 = _mm_shuffle_pd(
966 xmm, xmm,
967 _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
968 return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
969 }
970
971 inline void Store2Val(double *ptr) const
972 {
973 _mm_storeu_pd(ptr, xmm);
974 }
975
976 inline void Store2ValAligned(double *ptr) const
977 {
978 _mm_store_pd(ptr, xmm);
979 }
980
981 inline void Store2Val(float *ptr) const
982 {
983 __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
984 GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
985 }
986
987 inline void Store2Val(unsigned char *ptr) const
988 {
989 __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
990 xmm,
991 _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
992 tmp = _mm_packs_epi32(tmp, tmp);
993 tmp = _mm_packus_epi16(tmp, tmp);
994 GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
995 }
996
997 inline void Store2Val(unsigned short *ptr) const
998 {
999 __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
1000 xmm,
1001 _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
1002 // X X X X 0 B 0 A --> X X X X A A B A
1003 tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
1004 GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
1005 }
1006
1007 inline void StoreMask(unsigned char *ptr) const
1008 {
1009 _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
1010 _mm_castpd_si128(xmm));
1011 }
1012
1013 inline operator double() const
1014 {
1015 return _mm_cvtsd_f64(xmm);
1016 }
1017};
1018
1019#else
1020
1021#ifndef NO_WARN_USE_SSE2_EMULATION
1022#warning "Software emulation of SSE2 !"
1023#endif
1024
1025class XMMReg2Double
1026{
1027 public:
1028 double low;
1029 double high;
1030
1031 // cppcheck-suppress uninitMemberVar
1032 XMMReg2Double() = default;
1033
1034 explicit XMMReg2Double(double val)
1035 {
1036 low = val;
1037 high = 0.0;
1038 }
1039
1040 XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
1041 {
1042 }
1043
1044 static inline XMMReg2Double Zero()
1045 {
1046 XMMReg2Double reg;
1047 reg.Zeroize();
1048 return reg;
1049 }
1050
1051 static inline XMMReg2Double Set1(double d)
1052 {
1053 XMMReg2Double reg;
1054 reg.low = d;
1055 reg.high = d;
1056 return reg;
1057 }
1058
1059 static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
1060 {
1061 XMMReg2Double reg;
1062 reg.nsLoad1ValHighAndLow(ptr);
1063 return reg;
1064 }
1065
1066 static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
1067 const XMMReg2Double &expr2)
1068 {
1069 XMMReg2Double reg;
1070
1071 if (expr1.low == expr2.low)
1072 memset(&(reg.low), 0xFF, sizeof(double));
1073 else
1074 reg.low = 0;
1075
1076 if (expr1.high == expr2.high)
1077 memset(&(reg.high), 0xFF, sizeof(double));
1078 else
1079 reg.high = 0;
1080
1081 return reg;
1082 }
1083
1084 static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
1085 const XMMReg2Double &expr2)
1086 {
1087 XMMReg2Double reg;
1088
1089 if (expr1.low != expr2.low)
1090 memset(&(reg.low), 0xFF, sizeof(double));
1091 else
1092 reg.low = 0;
1093
1094 if (expr1.high != expr2.high)
1095 memset(&(reg.high), 0xFF, sizeof(double));
1096 else
1097 reg.high = 0;
1098
1099 return reg;
1100 }
1101
1102 static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
1103 const XMMReg2Double &expr2)
1104 {
1105 XMMReg2Double reg;
1106
1107 if (expr1.low > expr2.low)
1108 memset(&(reg.low), 0xFF, sizeof(double));
1109 else
1110 reg.low = 0;
1111
1112 if (expr1.high > expr2.high)
1113 memset(&(reg.high), 0xFF, sizeof(double));
1114 else
1115 reg.high = 0;
1116
1117 return reg;
1118 }
1119
1120 static inline XMMReg2Double And(const XMMReg2Double &expr1,
1121 const XMMReg2Double &expr2)
1122 {
1123 XMMReg2Double reg;
1124 int low1[2], high1[2];
1125 int low2[2], high2[2];
1126 memcpy(low1, &expr1.low, sizeof(double));
1127 memcpy(high1, &expr1.high, sizeof(double));
1128 memcpy(low2, &expr2.low, sizeof(double));
1129 memcpy(high2, &expr2.high, sizeof(double));
1130 low1[0] &= low2[0];
1131 low1[1] &= low2[1];
1132 high1[0] &= high2[0];
1133 high1[1] &= high2[1];
1134 memcpy(&reg.low, low1, sizeof(double));
1135 memcpy(&reg.high, high1, sizeof(double));
1136 return reg;
1137 }
1138
1139 static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
1140 const XMMReg2Double &true_expr,
1141 const XMMReg2Double &false_expr)
1142 {
1143 XMMReg2Double reg;
1144 if (cond.low != 0)
1145 reg.low = true_expr.low;
1146 else
1147 reg.low = false_expr.low;
1148 if (cond.high != 0)
1149 reg.high = true_expr.high;
1150 else
1151 reg.high = false_expr.high;
1152 return reg;
1153 }
1154
1155 static inline XMMReg2Double Min(const XMMReg2Double &expr1,
1156 const XMMReg2Double &expr2)
1157 {
1158 XMMReg2Double reg;
1159 reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
1160 reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
1161 return reg;
1162 }
1163
1164 static inline XMMReg2Double Load2Val(const double *ptr)
1165 {
1166 XMMReg2Double reg;
1167 reg.nsLoad2Val(ptr);
1168 return reg;
1169 }
1170
1171 static inline XMMReg2Double Load2ValAligned(const double *ptr)
1172 {
1173 XMMReg2Double reg;
1174 reg.nsLoad2ValAligned(ptr);
1175 return reg;
1176 }
1177
1178 static inline XMMReg2Double Load2Val(const float *ptr)
1179 {
1180 XMMReg2Double reg;
1181 reg.nsLoad2Val(ptr);
1182 return reg;
1183 }
1184
1185 static inline XMMReg2Double Load2Val(const unsigned char *ptr)
1186 {
1187 XMMReg2Double reg;
1188 reg.nsLoad2Val(ptr);
1189 return reg;
1190 }
1191
1192 static inline XMMReg2Double Load2Val(const short *ptr)
1193 {
1194 XMMReg2Double reg;
1195 reg.nsLoad2Val(ptr);
1196 return reg;
1197 }
1198
1199 static inline XMMReg2Double Load2Val(const unsigned short *ptr)
1200 {
1201 XMMReg2Double reg;
1202 reg.nsLoad2Val(ptr);
1203 return reg;
1204 }
1205
1206 static inline XMMReg2Double Load2Val(const int *ptr)
1207 {
1208 XMMReg2Double reg;
1209 reg.nsLoad2Val(ptr);
1210 return reg;
1211 }
1212
1213 inline void nsLoad1ValHighAndLow(const double *ptr)
1214 {
1215 low = ptr[0];
1216 high = ptr[0];
1217 }
1218
1219 inline void nsLoad2Val(const double *ptr)
1220 {
1221 low = ptr[0];
1222 high = ptr[1];
1223 }
1224
1225 inline void nsLoad2ValAligned(const double *ptr)
1226 {
1227 low = ptr[0];
1228 high = ptr[1];
1229 }
1230
1231 inline void nsLoad2Val(const float *ptr)
1232 {
1233 low = ptr[0];
1234 high = ptr[1];
1235 }
1236
1237 inline void nsLoad2Val(const unsigned char *ptr)
1238 {
1239 low = ptr[0];
1240 high = ptr[1];
1241 }
1242
1243 inline void nsLoad2Val(const short *ptr)
1244 {
1245 low = ptr[0];
1246 high = ptr[1];
1247 }
1248
1249 inline void nsLoad2Val(const unsigned short *ptr)
1250 {
1251 low = ptr[0];
1252 high = ptr[1];
1253 }
1254
1255 inline void nsLoad2Val(const int *ptr)
1256 {
1257 low = ptr[0];
1258 high = ptr[1];
1259 }
1260
1261 static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
1262 XMMReg2Double &high)
1263 {
1264 low.low = ptr[0];
1265 low.high = ptr[1];
1266 high.low = ptr[2];
1267 high.high = ptr[3];
1268 }
1269
1270 static inline void Load4Val(const short *ptr, XMMReg2Double &low,
1271 XMMReg2Double &high)
1272 {
1273 low.nsLoad2Val(ptr);
1274 high.nsLoad2Val(ptr + 2);
1275 }
1276
1277 static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
1278 XMMReg2Double &high)
1279 {
1280 low.nsLoad2Val(ptr);
1281 high.nsLoad2Val(ptr + 2);
1282 }
1283
1284 static inline void Load4Val(const double *ptr, XMMReg2Double &low,
1285 XMMReg2Double &high)
1286 {
1287 low.nsLoad2Val(ptr);
1288 high.nsLoad2Val(ptr + 2);
1289 }
1290
1291 static inline void Load4Val(const float *ptr, XMMReg2Double &low,
1292 XMMReg2Double &high)
1293 {
1294 low.nsLoad2Val(ptr);
1295 high.nsLoad2Val(ptr + 2);
1296 }
1297
1298 inline void Zeroize()
1299 {
1300 low = 0.0;
1301 high = 0.0;
1302 }
1303
1304 inline XMMReg2Double &operator=(const XMMReg2Double &other)
1305 {
1306 low = other.low;
1307 high = other.high;
1308 return *this;
1309 }
1310
1311 inline XMMReg2Double &operator+=(const XMMReg2Double &other)
1312 {
1313 low += other.low;
1314 high += other.high;
1315 return *this;
1316 }
1317
1318 inline XMMReg2Double &operator*=(const XMMReg2Double &other)
1319 {
1320 low *= other.low;
1321 high *= other.high;
1322 return *this;
1323 }
1324
1325 inline XMMReg2Double operator+(const XMMReg2Double &other) const
1326 {
1327 XMMReg2Double ret;
1328 ret.low = low + other.low;
1329 ret.high = high + other.high;
1330 return ret;
1331 }
1332
1333 inline XMMReg2Double operator-(const XMMReg2Double &other) const
1334 {
1335 XMMReg2Double ret;
1336 ret.low = low - other.low;
1337 ret.high = high - other.high;
1338 return ret;
1339 }
1340
1341 inline XMMReg2Double operator*(const XMMReg2Double &other) const
1342 {
1343 XMMReg2Double ret;
1344 ret.low = low * other.low;
1345 ret.high = high * other.high;
1346 return ret;
1347 }
1348
1349 inline XMMReg2Double operator/(const XMMReg2Double &other) const
1350 {
1351 XMMReg2Double ret;
1352 ret.low = low / other.low;
1353 ret.high = high / other.high;
1354 return ret;
1355 }
1356
1357 inline double GetHorizSum() const
1358 {
1359 return low + high;
1360 }
1361
1362 inline void Store2Val(double *ptr) const
1363 {
1364 ptr[0] = low;
1365 ptr[1] = high;
1366 }
1367
1368 inline void Store2ValAligned(double *ptr) const
1369 {
1370 ptr[0] = low;
1371 ptr[1] = high;
1372 }
1373
1374 inline void Store2Val(float *ptr) const
1375 {
1376 ptr[0] = static_cast<float>(low);
1377 ptr[1] = static_cast<float>(high);
1378 }
1379
1380 void Store2Val(unsigned char *ptr) const
1381 {
1382 ptr[0] = static_cast<unsigned char>(low + 0.5);
1383 ptr[1] = static_cast<unsigned char>(high + 0.5);
1384 }
1385
1386 void Store2Val(unsigned short *ptr) const
1387 {
1388 ptr[0] = static_cast<GUInt16>(low + 0.5);
1389 ptr[1] = static_cast<GUInt16>(high + 0.5);
1390 }
1391
1392 inline void StoreMask(unsigned char *ptr) const
1393 {
1394 memcpy(ptr, &low, 8);
1395 memcpy(ptr + 8, &high, 8);
1396 }
1397
1398 inline operator double() const
1399 {
1400 return low;
1401 }
1402};
1403
1404#endif /* defined(__x86_64) || defined(_M_X64) */
1405
1406#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)
1407
1408#include <immintrin.h>
1409
1410class XMMReg4Double
1411{
1412 public:
1413 __m256d ymm;
1414
1415 XMMReg4Double() : ymm(_mm256_setzero_pd())
1416 {
1417 }
1418
1419 XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
1420 {
1421 }
1422
1423 static inline XMMReg4Double Zero()
1424 {
1425 XMMReg4Double reg;
1426 reg.Zeroize();
1427 return reg;
1428 }
1429
1430 static inline XMMReg4Double Set1(double d)
1431 {
1432 XMMReg4Double reg;
1433 reg.ymm = _mm256_set1_pd(d);
1434 return reg;
1435 }
1436
1437 inline void Zeroize()
1438 {
1439 ymm = _mm256_setzero_pd();
1440 }
1441
1442 static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
1443 {
1444 XMMReg4Double reg;
1445 reg.nsLoad1ValHighAndLow(ptr);
1446 return reg;
1447 }
1448
1449 inline void nsLoad1ValHighAndLow(const double *ptr)
1450 {
1451 ymm = _mm256_set1_pd(*ptr);
1452 }
1453
1454 static inline XMMReg4Double Load4Val(const unsigned char *ptr)
1455 {
1456 XMMReg4Double reg;
1457 reg.nsLoad4Val(ptr);
1458 return reg;
1459 }
1460
1461 inline void nsLoad4Val(const unsigned char *ptr)
1462 {
1463 __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
1464 xmm_i = _mm_cvtepu8_epi32(xmm_i);
1465 ymm = _mm256_cvtepi32_pd(xmm_i);
1466 }
1467
1468 static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
1469 XMMReg4Double &high)
1470 {
1471 const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
1472 const __m128i xmm_i_low = _mm_cvtepu8_epi32(xmm_i);
1473 low.ymm = _mm256_cvtepi32_pd(xmm_i_low);
1474 const __m128i xmm_i_high = _mm_cvtepu8_epi32(_mm_srli_si128(xmm_i, 4));
1475 high.ymm = _mm256_cvtepi32_pd(xmm_i_high);
1476 }
1477
1478 static inline XMMReg4Double Load4Val(const short *ptr)
1479 {
1480 XMMReg4Double reg;
1481 reg.nsLoad4Val(ptr);
1482 return reg;
1483 }
1484
1485 inline void nsLoad4Val(const short *ptr)
1486 {
1487 __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
1488 xmm_i = _mm_cvtepi16_epi32(xmm_i);
1489 ymm = _mm256_cvtepi32_pd(xmm_i);
1490 }
1491
1492 static inline void Load8Val(const short *ptr, XMMReg4Double &low,
1493 XMMReg4Double &high)
1494 {
1495 low.nsLoad4Val(ptr);
1496 high.nsLoad4Val(ptr + 4);
1497 }
1498
1499 static inline XMMReg4Double Load4Val(const unsigned short *ptr)
1500 {
1501 XMMReg4Double reg;
1502 reg.nsLoad4Val(ptr);
1503 return reg;
1504 }
1505
1506 inline void nsLoad4Val(const unsigned short *ptr)
1507 {
1508 __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
1509 xmm_i = _mm_cvtepu16_epi32(xmm_i);
1510 ymm = _mm256_cvtepi32_pd(
1511 xmm_i); // ok to use signed conversion since we are in the ushort
1512 // range, so cannot be interpreted as negative int32
1513 }
1514
1515 static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
1516 XMMReg4Double &high)
1517 {
1518 low.nsLoad4Val(ptr);
1519 high.nsLoad4Val(ptr + 4);
1520 }
1521
1522 static inline XMMReg4Double Load4Val(const double *ptr)
1523 {
1524 XMMReg4Double reg;
1525 reg.nsLoad4Val(ptr);
1526 return reg;
1527 }
1528
1529 inline void nsLoad4Val(const double *ptr)
1530 {
1531 ymm = _mm256_loadu_pd(ptr);
1532 }
1533
1534 static inline void Load8Val(const double *ptr, XMMReg4Double &low,
1535 XMMReg4Double &high)
1536 {
1537 low.nsLoad4Val(ptr);
1538 high.nsLoad4Val(ptr + 4);
1539 }
1540
1541 static inline XMMReg4Double Load4ValAligned(const double *ptr)
1542 {
1543 XMMReg4Double reg;
1544 reg.nsLoad4ValAligned(ptr);
1545 return reg;
1546 }
1547
1548 inline void nsLoad4ValAligned(const double *ptr)
1549 {
1550 ymm = _mm256_load_pd(ptr);
1551 }
1552
1553 static inline XMMReg4Double Load4Val(const float *ptr)
1554 {
1555 XMMReg4Double reg;
1556 reg.nsLoad4Val(ptr);
1557 return reg;
1558 }
1559
1560 inline void nsLoad4Val(const float *ptr)
1561 {
1562 ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
1563 }
1564
1565 static inline void Load8Val(const float *ptr, XMMReg4Double &low,
1566 XMMReg4Double &high)
1567 {
1568 low.nsLoad4Val(ptr);
1569 high.nsLoad4Val(ptr + 4);
1570 }
1571
1572 static inline XMMReg4Double Load4Val(const int *ptr)
1573 {
1574 XMMReg4Double reg;
1575 reg.nsLoad4Val(ptr);
1576 return reg;
1577 }
1578
1579 inline void nsLoad4Val(const int *ptr)
1580 {
1581 ymm = _mm256_cvtepi32_pd(
1582 _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)));
1583 }
1584
1585 static inline void Load8Val(const int *ptr, XMMReg4Double &low,
1586 XMMReg4Double &high)
1587 {
1588 low.nsLoad4Val(ptr);
1589 high.nsLoad4Val(ptr + 4);
1590 }
1591
1592 static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
1593 const XMMReg4Double &expr2)
1594 {
1595 XMMReg4Double reg;
1596 reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
1597 return reg;
1598 }
1599
1600 static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
1601 const XMMReg4Double &expr2)
1602 {
1603 XMMReg4Double reg;
1604 reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
1605 return reg;
1606 }
1607
1608 static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
1609 const XMMReg4Double &expr2)
1610 {
1611 XMMReg4Double reg;
1612 reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
1613 return reg;
1614 }
1615
1616 static inline XMMReg4Double And(const XMMReg4Double &expr1,
1617 const XMMReg4Double &expr2)
1618 {
1619 XMMReg4Double reg;
1620 reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
1621 return reg;
1622 }
1623
1624 static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
1625 const XMMReg4Double &true_expr,
1626 const XMMReg4Double &false_expr)
1627 {
1628 XMMReg4Double reg;
1629 reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
1630 _mm256_andnot_pd(cond.ymm, false_expr.ymm));
1631 return reg;
1632 }
1633
1634 static inline XMMReg4Double Min(const XMMReg4Double &expr1,
1635 const XMMReg4Double &expr2)
1636 {
1637 XMMReg4Double reg;
1638 reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
1639 return reg;
1640 }
1641
1642 inline XMMReg4Double &operator=(const XMMReg4Double &other)
1643 {
1644 ymm = other.ymm;
1645 return *this;
1646 }
1647
1648 inline XMMReg4Double &operator+=(const XMMReg4Double &other)
1649 {
1650 ymm = _mm256_add_pd(ymm, other.ymm);
1651 return *this;
1652 }
1653
1654 inline XMMReg4Double &operator*=(const XMMReg4Double &other)
1655 {
1656 ymm = _mm256_mul_pd(ymm, other.ymm);
1657 return *this;
1658 }
1659
1660 inline XMMReg4Double operator+(const XMMReg4Double &other) const
1661 {
1662 XMMReg4Double ret;
1663 ret.ymm = _mm256_add_pd(ymm, other.ymm);
1664 return ret;
1665 }
1666
1667 inline XMMReg4Double operator-(const XMMReg4Double &other) const
1668 {
1669 XMMReg4Double ret;
1670 ret.ymm = _mm256_sub_pd(ymm, other.ymm);
1671 return ret;
1672 }
1673
1674 inline XMMReg4Double operator*(const XMMReg4Double &other) const
1675 {
1676 XMMReg4Double ret;
1677 ret.ymm = _mm256_mul_pd(ymm, other.ymm);
1678 return ret;
1679 }
1680
1681 inline XMMReg4Double operator/(const XMMReg4Double &other) const
1682 {
1683 XMMReg4Double ret;
1684 ret.ymm = _mm256_div_pd(ymm, other.ymm);
1685 return ret;
1686 }
1687
1688 void AddToLow(const XMMReg2Double &other)
1689 {
1690 __m256d ymm2 = _mm256_setzero_pd();
1691 ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
1692 ymm = _mm256_add_pd(ymm, ymm2);
1693 }
1694
1695 inline double GetHorizSum() const
1696 {
1697 __m256d ymm_tmp1, ymm_tmp2;
1698 ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
1699 ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
1700 ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
1701 return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
1702 }
1703
1704 inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
1705 const XMMReg4Double &half) const
1706 {
1707 __m256d reg = ymm;
1708 __m256d reg_half = _mm256_mul_pd(reg, half.ymm);
1709 // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
1710 reg = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(reg)));
1711 // And perform one step of Newton-Raphson approximation to improve it
1712 // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
1713 // 0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
1714 const __m256d one_and_a_half = _mm256_add_pd(one.ymm, half.ymm);
1715 reg = _mm256_mul_pd(
1716 reg,
1717 _mm256_sub_pd(one_and_a_half,
1718 _mm256_mul_pd(reg_half, _mm256_mul_pd(reg, reg))));
1719 XMMReg4Double ret;
1720 ret.ymm = reg;
1721 return ret;
1722 }
1723
1724 inline XMMReg4Float cast_to_float() const
1725 {
1726 XMMReg4Float ret;
1727 ret.xmm = _mm256_cvtpd_ps(ymm);
1728 return ret;
1729 }
1730
1731 inline void Store4Val(unsigned char *ptr) const
1732 {
1733 __m128i xmm_i =
1734 _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
1735 // xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1736 // xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
1737 xmm_i =
1738 _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
1739 (12 << 24))); // SSSE3
1740 GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
1741 }
1742
1743 inline void Store4Val(unsigned short *ptr) const
1744 {
1745 __m128i xmm_i =
1746 _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
1747 xmm_i = _mm_packus_epi32(xmm_i, xmm_i); // Pack uint32 to uint16
1748 GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
1749 }
1750
1751 inline void Store4Val(float *ptr) const
1752 {
1753 _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
1754 }
1755
1756 inline void Store4Val(double *ptr) const
1757 {
1758 _mm256_storeu_pd(ptr, ymm);
1759 }
1760
1761 inline void StoreMask(unsigned char *ptr) const
1762 {
1763 _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
1764 _mm256_castpd_si256(ymm));
1765 }
1766};
1767
1768inline XMMReg4Double XMMReg4Float::cast_to_double() const
1769{
1770 XMMReg4Double ret;
1771 ret.ymm = _mm256_cvtps_pd(xmm);
1772 return ret;
1773}
1774
1775inline XMMReg4Double XMMReg4Int::cast_to_double() const
1776{
1777 XMMReg4Double ret;
1778 ret.ymm = _mm256_cvtepi32_pd(xmm);
1779 return ret;
1780}
1781
1782class XMMReg8Float
1783{
1784 public:
1785 __m256 ymm;
1786
1787 XMMReg8Float()
1788#if !defined(_MSC_VER)
1789 : ymm(_mm256_undefined_ps())
1790#endif
1791 {
1792 }
1793
1794 XMMReg8Float(const XMMReg8Float &other) : ymm(other.ymm)
1795 {
1796 }
1797
1798 static inline XMMReg8Float Set1(float f)
1799 {
1800 XMMReg8Float reg;
1801 reg.ymm = _mm256_set1_ps(f);
1802 return reg;
1803 }
1804
1805 static inline XMMReg8Float LoadAllVal(const float *ptr)
1806 {
1807 return Load8Val(ptr);
1808 }
1809
1810 static inline XMMReg8Float Load8Val(const float *ptr)
1811 {
1812 XMMReg8Float reg;
1813 reg.nsLoad8Val(ptr);
1814 return reg;
1815 }
1816
1817 static inline XMMReg8Float Load8Val(const int *ptr)
1818 {
1819 XMMReg8Float reg;
1820 reg.nsLoad8Val(ptr);
1821 return reg;
1822 }
1823
1824 static inline XMMReg8Float Max(const XMMReg8Float &expr1,
1825 const XMMReg8Float &expr2)
1826 {
1827 XMMReg8Float reg;
1828 reg.ymm = _mm256_max_ps(expr1.ymm, expr2.ymm);
1829 return reg;
1830 }
1831
1832 inline void nsLoad8Val(const float *ptr)
1833 {
1834 ymm = _mm256_loadu_ps(ptr);
1835 }
1836
1837 inline void nsLoad8Val(const int *ptr)
1838 {
1839 const __m256i ymm_i =
1840 _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
1841 ymm = _mm256_cvtepi32_ps(ymm_i);
1842 }
1843
1844 inline XMMReg8Float &operator=(const XMMReg8Float &other)
1845 {
1846 ymm = other.ymm;
1847 return *this;
1848 }
1849
1850 inline XMMReg8Float &operator+=(const XMMReg8Float &other)
1851 {
1852 ymm = _mm256_add_ps(ymm, other.ymm);
1853 return *this;
1854 }
1855
1856 inline XMMReg8Float &operator-=(const XMMReg8Float &other)
1857 {
1858 ymm = _mm256_sub_ps(ymm, other.ymm);
1859 return *this;
1860 }
1861
1862 inline XMMReg8Float &operator*=(const XMMReg8Float &other)
1863 {
1864 ymm = _mm256_mul_ps(ymm, other.ymm);
1865 return *this;
1866 }
1867
1868 inline XMMReg8Float operator+(const XMMReg8Float &other) const
1869 {
1870 XMMReg8Float ret;
1871 ret.ymm = _mm256_add_ps(ymm, other.ymm);
1872 return ret;
1873 }
1874
1875 inline XMMReg8Float operator-(const XMMReg8Float &other) const
1876 {
1877 XMMReg8Float ret;
1878 ret.ymm = _mm256_sub_ps(ymm, other.ymm);
1879 return ret;
1880 }
1881
1882 inline XMMReg8Float operator*(const XMMReg8Float &other) const
1883 {
1884 XMMReg8Float ret;
1885 ret.ymm = _mm256_mul_ps(ymm, other.ymm);
1886 return ret;
1887 }
1888
1889 inline XMMReg8Float operator/(const XMMReg8Float &other) const
1890 {
1891 XMMReg8Float ret;
1892 ret.ymm = _mm256_div_ps(ymm, other.ymm);
1893 return ret;
1894 }
1895
1896 inline XMMReg8Float inverse() const
1897 {
1898 XMMReg8Float ret;
1899 ret.ymm = _mm256_div_ps(_mm256_set1_ps(1.0f), ymm);
1900 return ret;
1901 }
1902
1903 inline XMMReg8Float approx_inv_sqrt(const XMMReg8Float &one,
1904 const XMMReg8Float &half) const
1905 {
1906 __m256 reg = ymm;
1907 __m256 reg_half = _mm256_mul_ps(reg, half.ymm);
1908 // Compute rough approximation of 1 / sqrt(b) with _mm256_rsqrt_ps
1909 reg = _mm256_rsqrt_ps(reg);
1910 // And perform one step of Newton-Raphson approximation to improve it
1911 // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
1912 // 0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
1913 const __m256 one_and_a_half = _mm256_add_ps(one.ymm, half.ymm);
1914 reg = _mm256_mul_ps(
1915 reg,
1916 _mm256_sub_ps(one_and_a_half,
1917 _mm256_mul_ps(reg_half, _mm256_mul_ps(reg, reg))));
1918 XMMReg8Float ret;
1919 ret.ymm = reg;
1920 return ret;
1921 }
1922
1923 inline void StoreAllVal(float *ptr) const
1924 {
1925 Store8Val(ptr);
1926 }
1927
1928 inline void Store8Val(float *ptr) const
1929 {
1930 _mm256_storeu_ps(ptr, ymm);
1931 }
1932
1933 XMMReg8Float cast_to_float() const
1934 {
1935 return *this;
1936 }
1937};
1938
1939#if defined(__AVX2__)
1940
1941class XMMReg8Int
1942{
1943 public:
1944 __m256i ymm;
1945
1946 XMMReg8Int()
1947#if !defined(_MSC_VER)
1948 : ymm(_mm256_undefined_si256())
1949#endif
1950 {
1951 }
1952
1953 XMMReg8Int(const XMMReg8Int &other) : ymm(other.ymm)
1954 {
1955 }
1956
1957 inline XMMReg8Int &operator=(const XMMReg8Int &other)
1958 {
1959 ymm = other.ymm;
1960 return *this;
1961 }
1962
1963 static inline XMMReg8Int Zero()
1964 {
1965 XMMReg8Int reg;
1966 reg.ymm = _mm256_setzero_si256();
1967 return reg;
1968 }
1969
1970 static inline XMMReg8Int Set1(int i)
1971 {
1972 XMMReg8Int reg;
1973 reg.ymm = _mm256_set1_epi32(i);
1974 return reg;
1975 }
1976
1977 static inline XMMReg8Int LoadAllVal(const int *ptr)
1978 {
1979 return Load8Val(ptr);
1980 }
1981
1982 static inline XMMReg8Int Load8Val(const int *ptr)
1983 {
1984 XMMReg8Int reg;
1985 reg.nsLoad8Val(ptr);
1986 return reg;
1987 }
1988
1989 inline void nsLoad8Val(const int *ptr)
1990 {
1991 ymm = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
1992 }
1993
1994 static inline XMMReg8Int Equals(const XMMReg8Int &expr1,
1995 const XMMReg8Int &expr2)
1996 {
1997 XMMReg8Int reg;
1998 reg.ymm = _mm256_cmpeq_epi32(expr1.ymm, expr2.ymm);
1999 return reg;
2000 }
2001
2002 static inline XMMReg8Int Ternary(const XMMReg8Int &cond,
2003 const XMMReg8Int &true_expr,
2004 const XMMReg8Int &false_expr)
2005 {
2006 XMMReg8Int reg;
2007 reg.ymm =
2008 _mm256_or_si256(_mm256_and_si256(cond.ymm, true_expr.ymm),
2009 _mm256_andnot_si256(cond.ymm, false_expr.ymm));
2010 return reg;
2011 }
2012
2013 inline XMMReg8Int &operator+=(const XMMReg8Int &other)
2014 {
2015 ymm = _mm256_add_epi32(ymm, other.ymm);
2016 return *this;
2017 }
2018
2019 inline XMMReg8Int &operator-=(const XMMReg8Int &other)
2020 {
2021 ymm = _mm256_sub_epi32(ymm, other.ymm);
2022 return *this;
2023 }
2024
2025 inline XMMReg8Int operator+(const XMMReg8Int &other) const
2026 {
2027 XMMReg8Int ret;
2028 ret.ymm = _mm256_add_epi32(ymm, other.ymm);
2029 return ret;
2030 }
2031
2032 inline XMMReg8Int operator-(const XMMReg8Int &other) const
2033 {
2034 XMMReg8Int ret;
2035 ret.ymm = _mm256_sub_epi32(ymm, other.ymm);
2036 return ret;
2037 }
2038
2039 XMMReg8Float cast_to_float() const
2040 {
2041 XMMReg8Float ret;
2042 ret.ymm = _mm256_cvtepi32_ps(ymm);
2043 return ret;
2044 }
2045};
2046
2047#endif
2048
2049#else
2050
2051class XMMReg4Double
2052{
2053 public:
2054 XMMReg2Double low, high;
2055
2056 XMMReg4Double() : low(XMMReg2Double()), high(XMMReg2Double())
2057 {
2058 }
2059
2060 XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
2061 {
2062 }
2063
2064 static inline XMMReg4Double Zero()
2065 {
2066 XMMReg4Double reg;
2067 reg.low.Zeroize();
2068 reg.high.Zeroize();
2069 return reg;
2070 }
2071
2072 static inline XMMReg4Double Set1(double d)
2073 {
2074 XMMReg4Double reg;
2075 reg.low = XMMReg2Double::Set1(d);
2076 reg.high = XMMReg2Double::Set1(d);
2077 return reg;
2078 }
2079
2080 static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
2081 {
2082 XMMReg4Double reg;
2083 reg.low.nsLoad1ValHighAndLow(ptr);
2084 reg.high = reg.low;
2085 return reg;
2086 }
2087
2088 static inline XMMReg4Double Load4Val(const unsigned char *ptr)
2089 {
2090 XMMReg4Double reg;
2091 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
2092 return reg;
2093 }
2094
2095 static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
2096 XMMReg4Double &high)
2097 {
2098 low = Load4Val(ptr);
2099 high = Load4Val(ptr + 4);
2100 }
2101
2102 static inline XMMReg4Double Load4Val(const short *ptr)
2103 {
2104 XMMReg4Double reg;
2105 reg.low.nsLoad2Val(ptr);
2106 reg.high.nsLoad2Val(ptr + 2);
2107 return reg;
2108 }
2109
2110 static inline void Load8Val(const short *ptr, XMMReg4Double &low,
2111 XMMReg4Double &high)
2112 {
2113 low = Load4Val(ptr);
2114 high = Load4Val(ptr + 4);
2115 }
2116
2117 static inline XMMReg4Double Load4Val(const unsigned short *ptr)
2118 {
2119 XMMReg4Double reg;
2120 reg.low.nsLoad2Val(ptr);
2121 reg.high.nsLoad2Val(ptr + 2);
2122 return reg;
2123 }
2124
2125 static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
2126 XMMReg4Double &high)
2127 {
2128 low = Load4Val(ptr);
2129 high = Load4Val(ptr + 4);
2130 }
2131
2132 static inline XMMReg4Double Load4Val(const int *ptr)
2133 {
2134 XMMReg4Double reg;
2135 reg.low.nsLoad2Val(ptr);
2136 reg.high.nsLoad2Val(ptr + 2);
2137 return reg;
2138 }
2139
2140 static inline void Load8Val(const int *ptr, XMMReg4Double &low,
2141 XMMReg4Double &high)
2142 {
2143 low = Load4Val(ptr);
2144 high = Load4Val(ptr + 4);
2145 }
2146
2147 static inline XMMReg4Double Load4Val(const double *ptr)
2148 {
2149 XMMReg4Double reg;
2150 reg.low.nsLoad2Val(ptr);
2151 reg.high.nsLoad2Val(ptr + 2);
2152 return reg;
2153 }
2154
2155 static inline void Load8Val(const double *ptr, XMMReg4Double &low,
2156 XMMReg4Double &high)
2157 {
2158 low = Load4Val(ptr);
2159 high = Load4Val(ptr + 4);
2160 }
2161
2162 static inline XMMReg4Double Load4ValAligned(const double *ptr)
2163 {
2164 XMMReg4Double reg;
2165 reg.low.nsLoad2ValAligned(ptr);
2166 reg.high.nsLoad2ValAligned(ptr + 2);
2167 return reg;
2168 }
2169
2170 static inline XMMReg4Double Load4Val(const float *ptr)
2171 {
2172 XMMReg4Double reg;
2173 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
2174 return reg;
2175 }
2176
2177 static inline void Load8Val(const float *ptr, XMMReg4Double &low,
2178 XMMReg4Double &high)
2179 {
2180 low = Load4Val(ptr);
2181 high = Load4Val(ptr + 4);
2182 }
2183
2184 static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
2185 const XMMReg4Double &expr2)
2186 {
2187 XMMReg4Double reg;
2188 reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
2189 reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
2190 return reg;
2191 }
2192
2193 static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
2194 const XMMReg4Double &expr2)
2195 {
2196 XMMReg4Double reg;
2197 reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
2198 reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
2199 return reg;
2200 }
2201
2202 static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
2203 const XMMReg4Double &expr2)
2204 {
2205 XMMReg4Double reg;
2206 reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
2207 reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
2208 return reg;
2209 }
2210
2211 static inline XMMReg4Double And(const XMMReg4Double &expr1,
2212 const XMMReg4Double &expr2)
2213 {
2214 XMMReg4Double reg;
2215 reg.low = XMMReg2Double::And(expr1.low, expr2.low);
2216 reg.high = XMMReg2Double::And(expr1.high, expr2.high);
2217 return reg;
2218 }
2219
2220 static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
2221 const XMMReg4Double &true_expr,
2222 const XMMReg4Double &false_expr)
2223 {
2224 XMMReg4Double reg;
2225 reg.low =
2226 XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
2227 reg.high =
2228 XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
2229 return reg;
2230 }
2231
2232 static inline XMMReg4Double Min(const XMMReg4Double &expr1,
2233 const XMMReg4Double &expr2)
2234 {
2235 XMMReg4Double reg;
2236 reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
2237 reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
2238 return reg;
2239 }
2240
2241 inline XMMReg4Double &operator=(const XMMReg4Double &other)
2242 {
2243 low = other.low;
2244 high = other.high;
2245 return *this;
2246 }
2247
2248 inline XMMReg4Double &operator+=(const XMMReg4Double &other)
2249 {
2250 low += other.low;
2251 high += other.high;
2252 return *this;
2253 }
2254
2255 inline XMMReg4Double &operator*=(const XMMReg4Double &other)
2256 {
2257 low *= other.low;
2258 high *= other.high;
2259 return *this;
2260 }
2261
2262 inline XMMReg4Double operator+(const XMMReg4Double &other) const
2263 {
2264 XMMReg4Double ret;
2265 ret.low = low + other.low;
2266 ret.high = high + other.high;
2267 return ret;
2268 }
2269
2270 inline XMMReg4Double operator-(const XMMReg4Double &other) const
2271 {
2272 XMMReg4Double ret;
2273 ret.low = low - other.low;
2274 ret.high = high - other.high;
2275 return ret;
2276 }
2277
2278 inline XMMReg4Double operator*(const XMMReg4Double &other) const
2279 {
2280 XMMReg4Double ret;
2281 ret.low = low * other.low;
2282 ret.high = high * other.high;
2283 return ret;
2284 }
2285
2286 inline XMMReg4Double operator/(const XMMReg4Double &other) const
2287 {
2288 XMMReg4Double ret;
2289 ret.low = low / other.low;
2290 ret.high = high / other.high;
2291 return ret;
2292 }
2293
2294 void AddToLow(const XMMReg2Double &other)
2295 {
2296 low += other;
2297 }
2298
2299 inline double GetHorizSum() const
2300 {
2301 return (low + high).GetHorizSum();
2302 }
2303
2304#if !defined(USE_SSE2_EMULATION)
2305 inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
2306 const XMMReg4Double &half) const
2307 {
2308 __m128d reg0 = low.xmm;
2309 __m128d reg1 = high.xmm;
2310 __m128d reg0_half = _mm_mul_pd(reg0, half.low.xmm);
2311 __m128d reg1_half = _mm_mul_pd(reg1, half.low.xmm);
2312 // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
2313 reg0 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg0)));
2314 reg1 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg1)));
2315 // And perform one step of Newton-Raphson approximation to improve it
2316 // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
2317 // 0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
2318 const __m128d one_and_a_half = _mm_add_pd(one.low.xmm, half.low.xmm);
2319 reg0 = _mm_mul_pd(
2320 reg0, _mm_sub_pd(one_and_a_half,
2321 _mm_mul_pd(reg0_half, _mm_mul_pd(reg0, reg0))));
2322 reg1 = _mm_mul_pd(
2323 reg1, _mm_sub_pd(one_and_a_half,
2324 _mm_mul_pd(reg1_half, _mm_mul_pd(reg1, reg1))));
2325 XMMReg4Double ret;
2326 ret.low.xmm = reg0;
2327 ret.high.xmm = reg1;
2328 return ret;
2329 }
2330
2331 inline XMMReg4Float cast_to_float() const
2332 {
2333 XMMReg4Float ret;
2334 ret.xmm = _mm_castsi128_ps(
2335 _mm_unpacklo_epi64(_mm_castps_si128(_mm_cvtpd_ps(low.xmm)),
2336 _mm_castps_si128(_mm_cvtpd_ps(high.xmm))));
2337 return ret;
2338 }
2339#endif
2340
2341 inline void Store4Val(unsigned char *ptr) const
2342 {
2343#ifdef USE_SSE2_EMULATION
2344 low.Store2Val(ptr);
2345 high.Store2Val(ptr + 2);
2346#else
2347 __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
2348 low.xmm,
2349 _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
2350 __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
2351 high.xmm,
2352 _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
2353 auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
2354 _mm_castsi128_ps(tmpHigh),
2355 _MM_SHUFFLE(1, 0, 1, 0)));
2356 tmp = _mm_packs_epi32(tmp, tmp);
2357 tmp = _mm_packus_epi16(tmp, tmp);
2358 GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
2359#endif
2360 }
2361
2362 inline void Store4Val(unsigned short *ptr) const
2363 {
2364#if 1
2365 low.Store2Val(ptr);
2366 high.Store2Val(ptr + 2);
2367#else
2368 __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
2369 __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
2370 xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
2371#if __SSE4_1__
2372 xmm0 = _mm_packus_epi32(xmm0, xmm0); // Pack uint32 to uint16
2373#else
2374 xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
2375 xmm0 = _mm_packs_epi32(xmm0, xmm0);
2376 xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
2377#endif
2378 GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
2379#endif
2380 }
2381
2382 inline void Store4Val(float *ptr) const
2383 {
2384 low.Store2Val(ptr);
2385 high.Store2Val(ptr + 2);
2386 }
2387
2388 inline void Store4Val(double *ptr) const
2389 {
2390 low.Store2Val(ptr);
2391 high.Store2Val(ptr + 2);
2392 }
2393
2394 inline void StoreMask(unsigned char *ptr) const
2395 {
2396 low.StoreMask(ptr);
2397 high.StoreMask(ptr + 16);
2398 }
2399};
2400
2401#if !defined(USE_SSE2_EMULATION)
2402inline XMMReg4Double XMMReg4Float::cast_to_double() const
2403{
2404 XMMReg4Double ret;
2405 ret.low.xmm = _mm_cvtps_pd(xmm);
2406 ret.high.xmm = _mm_cvtps_pd(
2407 _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(xmm), 8)));
2408 return ret;
2409}
2410
2411inline XMMReg4Double XMMReg4Int::cast_to_double() const
2412{
2413 XMMReg4Double ret;
2414 ret.low.xmm = _mm_cvtepi32_pd(xmm);
2415 ret.high.xmm = _mm_cvtepi32_pd(_mm_srli_si128(xmm, 8));
2416 return ret;
2417}
2418#endif
2419
2420#endif
2421
2422#endif /* #ifndef DOXYGEN_SKIP */
2423
2424#endif /* GDALSSE_PRIV_H_INCLUDED */
Core portability definitions for CPL.
short GInt16
Int16 type.
Definition cpl_port.h:161
GIntBig GInt64
Signed 64 bit integer type.
Definition cpl_port.h:216
unsigned short GUInt16
Unsigned int16 type.
Definition cpl_port.h:163
int GInt32
Int32 type.
Definition cpl_port.h:155