10 #include "Hamming.hpp"
12 #include <aliceVision/numeric/Accumulator.hpp>
13 #include <aliceVision/config.hpp>
15 #if ALICEVISION_IS_DEFINED(ALICEVISION_HAVE_SSE)
16 #include <aliceVision/system/Logger.hpp>
17 #include <xmmintrin.h>
// Alias for the template parameter T of the enclosing functor.
29 typedef T ElementType;
// Type used for the running sum and the return value, taken from Accumulator<T>::Type.
30 typedef typename Accumulator<T>::Type ResultType;
// Call operator: walks `size` iterations and accumulates squared differences
// into a ResultType.
//   a, b  iterators over the two sequences being compared
//   size  number of loop iterations
// NOTE(review): the enclosing struct header, the braces, the definition of
// `diff` and the return statement are not visible in this view; `diff` is
// presumably the difference of the current elements of `a` and `b` - confirm.
32 template<
typename Iterator1,
typename Iterator2>
33 inline ResultType operator()(Iterator1 a, Iterator2 b,
size_t size)
const
// Start from a value-initialized accumulator.
35 ResultType result = ResultType();
37 for (
size_t i = 0; i < size; ++i)
// Add the square of the current difference to the running sum.
40 result += diff * diff;
// Alias for the template parameter T of the enclosing functor.
50 typedef T ElementType;
// Type used for the running sum and the return value, taken from Accumulator<T>::Type.
51 typedef typename Accumulator<T>::Type ResultType;
// Call operator: accumulates squared differences, four terms at a time where
// possible and one term at a time otherwise (see the two `result +=` lines).
//   a, b  iterators over the two sequences being compared
//   size  number of elements; used to compute the end iterator `last`
// NOTE(review): the loop headers, the assignments to diff0..diff3 and the
// return statement are not visible in this view - confirm against the full file.
53 template<
typename Iterator1,
typename Iterator2>
54 inline ResultType operator()(Iterator1 a, Iterator2 b,
size_t size)
const
// Start from a value-initialized accumulator.
56 ResultType result = ResultType();
57 ResultType diff0, diff1, diff2, diff3;
// End of the first sequence; requires Iterator1 to support `+ size`.
58 Iterator1 last = a + size;
// Presumably the bound for the four-at-a-time loop - confirm.
// NOTE(review): when size < 3 this forms an iterator positioned before `a`;
// for raw pointers that arithmetic is undefined behavior even if the result
// is never dereferenced.
59 Iterator1 lastgroup = last - 3;
// Add four squared differences in one step.
68 result += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3;
// Add a single squared difference (presumably the leftover elements that do
// not fill a group of four - confirm).
76 result += diff0 * diff0;
82 #if ALICEVISION_IS_DEFINED(ALICEVISION_HAVE_SSE)
// Helper union used by l2_sse, which reads four floats from it through a
// member `f` (res.f[0] .. res.f[3]).
// NOTE(review): the member declarations are not visible in this view;
// presumably an __m128 overlaid with a float[4] - confirm.
87 union sseRegisterHelper
// Sum of squared differences between two float buffers using SSE intrinsics,
// consuming four floats per loop iteration.
//   b1, b2  input buffers (read through _mm_load_ps)
//   size    number of floats; the loop advances in steps of 4
// Returns the sum of the four lanes read from `res`.
// NOTE(review): several lines of this function are not visible in this view,
// including the store of `cumSum` into `res` and the condition that selects
// between the SSE path and the warning - confirm against the full file.
94 inline float l2_sse(
const float* b1,
const float* b2,
int size)
// NOTE(review): these C-style casts drop const from b1/b2, although the
// visible code only reads through the resulting pointers.
96 float* b1Pt = (
float*)b1;
97 float* b2Pt = (
float*)b2;
100 __m128 srcA, srcB, temp, cumSum;
// Zero the accumulator register.
// NOTE(review): _mm_load_ps requires a 16-byte-aligned address, but `zeros`
// is declared without any alignment specifier, so the alignment is not
// guaranteed; _mm_setzero_ps() would avoid the memory load altogether.
101 float zeros[4] = {0.f, 0.f, 0.f, 0.f};
102 cumSum = _mm_load_ps(zeros);
103 for (
int i = 0; i < size; i += 4)
// Load four floats from each buffer.
// NOTE(review): aligned loads - b1 + i and b2 + i must be 16-byte aligned;
// nothing visible here checks that, so callers must guarantee it (or
// _mm_loadu_ps should be used) - confirm.
105 srcA = _mm_load_ps(b1Pt + i);
106 srcB = _mm_load_ps(b2Pt + i);
// Lane-wise difference.
108 temp = _mm_sub_ps(srcA, srcB);
// Lane-wise square of the difference.
110 temp = _mm_mul_ps(temp, temp);
// Accumulate the four squared differences into the running per-lane sums.
112 cumSum = _mm_add_ps(cumSum, temp);
// NOTE(review): no assignment to `res` is visible before it is read below.
114 sseRegisterHelper res;
// Horizontal sum of the four lanes.
116 return (res.f[0] + res.f[1] + res.f[2] + res.f[3]);
// Warning emitted when the SSE path cannot be used; per the message text this
// is the case where size is not a multiple of 4. The value returned on this
// path is not visible in this view.
120 ALICEVISION_LOG_WARNING(
"/!\\ size is not modulus 4, distance cannot be performed in SSE");
// L2_Vectorized for float: its call operator forwards to optim_ss2::l2_sse.
129 struct L2_Vectorized<float>
// Element type handled by this functor.
131 typedef float ElementType;
// Return type of the call operator, taken from Accumulator<float>::Type.
132 typedef Accumulator<float>::Type ResultType;
// Call operator: delegates the whole computation to optim_ss2::l2_sse.
//   a, b  iterators passed straight to l2_sse, so they must be convertible
//         to const float*
//   size  number of elements
134 template<
typename Iterator1,
typename Iterator2>
135 inline ResultType operator()(Iterator1 a, Iterator2 b,
size_t size)
const
// NOTE(review): l2_sse takes `int size`, so this size_t argument is narrowed
// implicitly; values above INT_MAX would not be passed through correctly.
137 return optim_ss2::l2_sse(a, b, size);
141 #endif // ALICEVISION_HAVE_SSE