GDAL
gdalsse_priv.h
/******************************************************************************
 * $Id: gdalsse_priv.h 40153 2017-09-17 13:28:32Z rouault $
 *
 * Project:  GDAL
 * Purpose:  SSE2 helper
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 ****************************************************************************/

#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#ifndef DOXYGEN_SKIP

#include "cpl_port.h"

/* We restrict to 64-bit processors because they are guaranteed to have SSE2. */
/* This could also be used on 32-bit, but a runtime check would then be needed. */
#if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>
#include <string.h>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#include "gdal_priv_templates.hpp"

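/* The helpers below move small, possibly unaligned values between memory and
 * XMM registers. Where misaligned access can fault
 * (CPL_CPU_REQUIRES_ALIGNED_ACCESS), a memcpy() through a local variable is
 * the portable idiom; compilers reduce it to a plain load/store when safe. */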
static inline __m128i GDALCopyInt16ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
#else
    return _mm_cvtsi32_si128(*(unsigned short*)(ptr));
#endif
}

static inline __m128i GDALCopyInt32ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
#else
    return _mm_cvtsi32_si128(*(GInt32*)(ptr));
#endif
}

static inline __m128i GDALCopyInt64ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#else
    return _mm_cvtsi64_si128(*(GInt64*)(ptr));
#endif
}

static inline void GDALCopyXMMToInt16(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
#else
    *static_cast<GInt16*>(pDest) = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
#endif
}

/* GDALCopyXMMToInt32() and GDALCopyXMMToInt64() are used by the Store methods
 * further below; their definitions mirror GDALCopyXMMToInt16(). */
static inline void GDALCopyXMMToInt32(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt32 i = _mm_cvtsi128_si32(xmm);
    memcpy(pDest, &i, 4);
#else
    *static_cast<GInt32*>(pDest) = _mm_cvtsi128_si32(xmm);
#endif
}

static inline void GDALCopyXMMToInt64(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt64 i = _mm_cvtsi128_si64(xmm);
    memcpy(pDest, &i, 8);
#else
    *static_cast<GInt64*>(pDest) = _mm_cvtsi128_si64(xmm);
#endif
}
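
/* Example: GDALCopyInt32ToXMM(p) yields an XMM register whose low 32 bits
 * hold the 4 bytes at p, with the upper lanes zeroed; the nsLoad2Val()
 * methods below widen such packed 8/16-bit values to doubles. */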

class XMMReg2Double
{
  public:
    __m128d xmm;

    /* coverity[uninit_member] */
    XMMReg2Double() {}

    XMMReg2Double(double val)  { xmm = _mm_load_sd(&val); }
    XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm), _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }
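
    /* Note on Ternary(): the comparison functions above return per-lane
     * all-ones / all-zeros bit masks, so (cond & true_expr) | (~cond & false_expr)
     * selects lane-wise without a branch. */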

    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double* ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double* ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float* ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(xmm_i, 16);        /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3,2,3,2)));
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(xmm, xmm, _MM_SHUFFLE2(0,1)); /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }
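
    /* i.e. for xmm = (a, b) this computes a + b: the shuffle forms (b, a),
     * _mm_add_sd() adds the low lanes, and _mm_cvtsd_f64() extracts the result. */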

    inline void Store2Val(double* ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double* ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float* ptr) const
    {
        __m128i xmm_i = _mm_castps_si128( _mm_cvtpd_ps(xmm) );
        GDALCopyXMMToInt64(xmm_i, (GInt64*)ptr);
    }

    inline void Store2Val(unsigned char* ptr) const
    {
        /* Round the 2 double values to 2 integers (add 0.5, then truncate). */
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5)));
        /* As 16-bit words the register is 0|0|0|0|0|B|0|A; OR-ing it with a
         * copy shifted right by 2 bytes leaves A in word 0 and B in word 1. */
        tmp = _mm_or_si128(tmp, _mm_srli_si128(tmp, 2));
        tmp = _mm_packus_epi16(tmp, tmp); /* pack words to bytes with unsigned saturation */
        GDALCopyXMMToInt16(tmp, (GInt16*)ptr);
    }

    inline void Store2Val(unsigned short* ptr) const
    {
        /* Round the 2 double values to 2 integers (add 0.5, then truncate). */
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5)));
        /* Same word trick as above: gather A and B into words 0 and 1,
         * then store the low 32 bits. */
        tmp = _mm_or_si128(tmp, _mm_srli_si128(tmp, 2));
        GDALCopyXMMToInt32(tmp, (GInt32*)ptr);

        //ptr[0] = (GUInt16)_mm_extract_epi16(tmp, 0);
        //ptr[1] = (GUInt16)_mm_extract_epi16(tmp, 2);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        _mm_storeu_si128( (__m128i*)ptr, _mm_castpd_si128(xmm) );
    }

    inline operator double () const
    {
        return _mm_cvtsd_f64(xmm);
    }
};

#else

#warning "Software emulation of SSE2 !"

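/* This fallback mirrors the SSE2 XMMReg2Double interface, so code written
 * against XMMReg2Double compiles unchanged on targets without SSE2. */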
class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() {}
    XMMReg2Double(double val)  { low = val; high = 0.0; }
    XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
    {
        XMMReg2Double reg;
        if( cond.low )
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if( cond.high )
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    void Store2Val(unsigned char* ptr) const
    {
        ptr[0] = (unsigned char)(low + 0.5);
        ptr[1] = (unsigned char)(high + 0.5);
    }

    void Store2Val(unsigned short* ptr) const
    {
        ptr[0] = (GUInt16)(low + 0.5);
        ptr[1] = (GUInt16)(high + 0.5);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double () const
    {
        return low;
    }
};

#endif /* (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION) */
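
/* Illustrative sketch (hypothetical helper, not part of GDAL): typical use of
 * XMMReg2Double, whichever of the two implementations above is active. */
static inline double GDALExampleDotPair(const double* padfA, const double* padfB)
{
    XMMReg2Double a = XMMReg2Double::Load2Val(padfA);  // unaligned load of padfA[0..1]
    XMMReg2Double b = XMMReg2Double::Load2Val(padfB);  // unaligned load of padfB[0..1]
    return (a * b).GetHorizSum();  // padfA[0]*padfB[0] + padfA[1]*padfB[1]
}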

#ifdef __AVX__

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() {}
    XMMReg4Double(const XMMReg4Double& other) : ymm(other.ymm) {}

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }

    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i); // ok to use signed conversion since we are in the ushort range, so cannot be interpreted as negative int32
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double* ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double* ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float* ptr)
    {
        ymm = _mm256_cvtps_pd( _mm_loadu_ps(ptr) );
    }

    static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm), _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    inline XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow( const XMMReg2Double& other )
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd( ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }

    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }

    inline void Store4Val(unsigned char* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32 (_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        //xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
        //xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
        xmm_i = _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24))); // SSSE3
        GDALCopyXMMToInt32(xmm_i, (GInt32*)ptr);
    }

    inline void Store4Val(unsigned short* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32 (_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);   // Pack uint32 to uint16
        GDALCopyXMMToInt64(xmm_i, (GInt64*)ptr);
    }

    inline void Store4Val(float* ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps (ymm));
    }

    inline void Store4Val(double* ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        _mm256_storeu_si256( (__m256i*)ptr, _mm256_castpd_si256(ymm) );
    }
};

#else

class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

    XMMReg4Double() {}
    XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high = XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }

    inline XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow( const XMMReg2Double& other )
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }

    inline void Store4Val(unsigned char* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
    }

    inline void Store4Val(unsigned short* ptr) const
    {
#if 1
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32 (low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32 (high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#if __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0);   // Pack uint32 to uint16
#else
        xmm0 = _mm_add_epi32( xmm0, _mm_set1_epi32(-32768) );
        xmm0 = _mm_packs_epi32( xmm0, xmm0 );
        xmm0 = _mm_sub_epi16( xmm0, _mm_set1_epi16(-32768) );
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64*)ptr);
#endif
    }

    inline void Store4Val(float* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
    }

    inline void Store4Val(double* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr+16);
    }

};

#endif
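
/* Illustrative sketch (hypothetical helper, not part of GDAL): XMMReg4Double
 * exposes the same interface four values at a time, backed either by one AVX
 * register or by two XMMReg2Double halves. */
static inline double GDALExampleSum4Bytes(const unsigned char* pabyIn)
{
    XMMReg4Double v = XMMReg4Double::Load4Val(pabyIn);  // widen 4 uint8 values to doubles
    return v.GetHorizSum();  // pabyIn[0] + pabyIn[1] + pabyIn[2] + pabyIn[3]
}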

#endif /* #ifndef DOXYGEN_SKIP */

#endif /* GDALSSE_PRIV_H_INCLUDED */