Merge pull request #11239 from rouault/alg_neon
Use SSE2 optimizations on ARM Neon for warping, pansharpening, gridding, dithering, RPC, PNG, GTI
rouault authored Nov 25, 2024
2 parents c7606da + 0642738 commit 9bc4894
Showing 19 changed files with 111 additions and 46 deletions.
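
All of the source changes below follow one pattern: wherever a file already has an SSE2 code path guarded by x86-only macros, the guard gains a USE_NEON_OPTIMIZATIONS branch (enabled through the GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS CMake option) that includes GDAL's include_sse2neon.h wrapper, so the existing _mm_* intrinsics are mapped to ARM NEON by sse2neon. A condensed sketch of that guard pattern follows; it is illustrative only, and the SumFourFloats helper is hypothetical rather than taken from any changed file.

// Condensed illustration of the guard pattern applied across the changed files.
#ifdef USE_NEON_OPTIMIZATIONS
// sse2neon re-implements the SSE2 intrinsics with ARM NEON instructions.
#include "include_sse2neon.h"
#define USE_SSE2
#elif defined(__x86_64) || defined(_M_X64)
// Genuine SSE2, guaranteed on 64-bit x86.
#include <emmintrin.h>
#define USE_SSE2
#endif

#ifdef USE_SSE2
// Hypothetical helper: the same intrinsic source now builds on both targets.
static float SumFourFloats(const float *p)
{
    const __m128 v = _mm_loadu_ps(p);          // load 4 floats (unaligned)
    float out[4];
    _mm_storeu_ps(out, v);                     // spill back to scalars
    return out[0] + out[1] + out[2] + out[3];  // horizontal add, scalar form
}
#endif
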
3 changes: 3 additions & 0 deletions .github/workflows/benchmarks/test.sh
@@ -19,7 +19,10 @@ BENCHMARK_OPTIONS=(
# Run target build and compare its results to the reference one.
# Fail if we get results 20% slower or more.
# Retry if that fails a first time.
# dist=no is needed because pytest-benchmark doesn't like other values of dist
# and in conftest.py/pytest.ini we set by default --dist=loadgroup
BENCHMARK_COMPARE_OPTIONS=(
"--dist=no" \
"--benchmark-compare-fail=min:20%" \
"--benchmark-compare=0001_ref" \
)
5 changes: 5 additions & 0 deletions alg/CMakeLists.txt
@@ -95,6 +95,11 @@ elseif (GDAL_USE_QHULL)
gdal_target_link_libraries(alg PRIVATE ${QHULL_LIBRARY})
endif ()

if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
target_compile_definitions(alg PRIVATE -DHAVE_SSE_AT_COMPILE_TIME -DUSE_NEON_OPTIMIZATIONS)
target_sources(alg PRIVATE gdalgridsse.cpp)
endif()

if (HAVE_SSE_AT_COMPILE_TIME)
target_sources(alg PRIVATE gdalgridsse.cpp)
target_compile_definitions(alg PRIVATE -DHAVE_SSE_AT_COMPILE_TIME)
12 changes: 10 additions & 2 deletions alg/gdal_rpc.cpp
@@ -34,10 +34,18 @@
#include "gdal_mdreader.h"
#include "gdal_alg_priv.h"
#include "gdal_priv.h"
#if defined(__x86_64) || defined(_M_X64)
#define USE_SSE2_OPTIM

#ifdef USE_NEON_OPTIMIZATIONS
#define USE_SSE2
#elif defined(__x86_64) || defined(_M_X64)
#define USE_SSE2
#endif

#ifdef USE_SSE2
#include "gdalsse_priv.h"
#define USE_SSE2_OPTIM
#endif

#include "ogr_api.h"
#include "ogr_geometry.h"
#include "ogr_spatialref.h"
7 changes: 5 additions & 2 deletions alg/gdaldither.cpp
@@ -45,13 +45,16 @@
#include "gdal.h"
#include "gdal_priv.h"

#if defined(__x86_64) || defined(_M_X64)
#ifdef USE_NEON_OPTIMIZATIONS
#define USE_SSE2
#include "include_sse2neon.h"
#elif defined(__x86_64) || defined(_M_X64)
#define USE_SSE2
#include <emmintrin.h>
#endif

#ifdef USE_SSE2

#include <emmintrin.h>
#define CAST_PCT(x) reinterpret_cast<GByte *>(x)
#define ALIGN_INT_ARRAY_ON_16_BYTE(x) \
(((reinterpret_cast<GUIntptr_t>(x) % 16) != 0) \
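
The dithering code keeps its SSE2 path (now also usable under NEON) operating on 16-byte-aligned scratch arrays; the ALIGN_INT_ARRAY_ON_16_BYTE macro is truncated by this view, but the underlying idea is to round a pointer inside an over-allocated buffer up to the next 16-byte boundary. A generic sketch of that idea (AlignTo16 is a hypothetical helper, not GDAL's macro):

#include <cstdint>

// Hypothetical helper: round p up to the next 16-byte boundary so that
// aligned loads such as _mm_load_si128 can be used on the result.
static inline int *AlignTo16(int *p)
{
    const std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(p);
    const std::uintptr_t aligned = (addr + 15u) & ~static_cast<std::uintptr_t>(15);
    return reinterpret_cast<int *>(aligned);
}
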
8 changes: 5 additions & 3 deletions alg/gdalgrid.cpp
@@ -2842,9 +2842,11 @@ GDALGridContext *GDALGridContextCreate(GDALGridAlgorithm eAlgorithm,
#ifdef HAVE_SSE_AT_COMPILE_TIME

if (pafXAligned == nullptr &&
CPLTestBool(
CPLGetConfigOption("GDAL_USE_SSE", "YES")) &&
CPLHaveRuntimeSSE())
CPLTestBool(CPLGetConfigOption("GDAL_USE_SSE", "YES"))
#if !defined(USE_NEON_OPTIMIZATIONS)
&& CPLHaveRuntimeSSE()
#endif
)
{
pafXAligned = static_cast<float *>(
VSI_MALLOC_ALIGNED_AUTO_VERBOSE(sizeof(float) *
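
A detail worth noting in the gdalgrid.cpp hunk above: on x86 the SSE path is additionally gated by a run-time CPU check, whereas a NEON build only targets CPUs where the mapped instructions are always available, so the CPLHaveRuntimeSSE() call is compiled out. A minimal sketch of that gating, mirroring the calls shown in the hunk (UseSSEGridPath is a hypothetical name, and the CPL headers listed in the comment are an assumption):

// Sketch of compile-time vs. run-time gating for the SSE grid path.
// (GDAL CPL headers assumed: cpl_conv.h, cpl_string.h, cpl_cpu_features.h.)
static bool UseSSEGridPath()
{
    bool bUse = CPLTestBool(CPLGetConfigOption("GDAL_USE_SSE", "YES"));
#if !defined(USE_NEON_OPTIMIZATIONS)
    // Only needed on x86: older CPUs may lack SSE, so verify at run time.
    bUse = bUse && CPLHaveRuntimeSSE();
#endif
    return bUse;
}
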
7 changes: 6 additions & 1 deletion alg/gdalgridsse.cpp
@@ -14,7 +14,12 @@
#include "gdalgrid_priv.h"

#ifdef HAVE_SSE_AT_COMPILE_TIME

#ifdef USE_NEON_OPTIMIZATIONS
#include "include_sse2neon.h"
#else
#include <xmmintrin.h>
#endif

/************************************************************************/
/* GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE() */
@@ -44,7 +49,7 @@ CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(
__m128 xmm_denominator = _mm_setzero_ps();
int mask = 0;

#if defined(__x86_64) || defined(_M_X64)
#if defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS)
// This would also work in 32bit mode, but there are only 8 XMM registers
// whereas we have 16 for 64bit.
const size_t LOOP_SIZE = 8;
3 changes: 2 additions & 1 deletion alg/gdalpansharpen.cpp
@@ -650,8 +650,9 @@ void GDALPansharpenOperation::WeightedBrovey3(

/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
/* Could possibly be used too on 32bit, but we would need to check at runtime */
#if defined(__x86_64) || defined(_M_X64)
#if defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS)

#define USE_SSE2
#include "gdalsse_priv.h"

template <class T, int NINPUT, int NOUTPUT>
40 changes: 24 additions & 16 deletions alg/gdalwarpkernel.cpp
@@ -52,9 +52,17 @@
#include "ogr_geos.h"
#endif

#ifdef USE_NEON_OPTIMIZATIONS
#include "include_sse2neon.h"
#define USE_SSE2

#include "gdalsse_priv.h"

// We restrict to 64bit processors because they are guaranteed to have SSE2.
// Could possibly be used too on 32bit, but we would need to check at runtime.
#if defined(__x86_64) || defined(_M_X64)
#elif defined(__x86_64) || defined(_M_X64)
#define USE_SSE2

#include "gdalsse_priv.h"

#if __SSE4_1__
@@ -2971,7 +2979,7 @@ static bool GWKCubicResample4Sample(const GDALWarpKernel *poWK, int iBand,
return true;
}

#if defined(__x86_64) || defined(_M_X64)
#ifdef USE_SSE2

/************************************************************************/
/* XMMLoad4Values() */
@@ -2987,7 +2995,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GByte *ptr)
__m128i xmm_i = _mm_cvtsi32_si128(i);
// Zero extend 4 packed unsigned 8-bit integers in a to packed
// 32-bit integers.
#if __SSE4_1__
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
@@ -3003,7 +3011,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
__m128i xmm_i = _mm_cvtsi64_si128(i);
// Zero extend 4 packed unsigned 16-bit integers in a to packed
// 32-bit integers.
#if __SSE4_1__
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
@@ -3017,7 +3025,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
/* Return the sum of the 4 floating points of the register. */
/************************************************************************/

#if __SSE3__
#if defined(__SSE3__) || defined(USE_NEON_OPTIMIZATIONS)
static CPL_INLINE float XMMHorizontalAdd(__m128 v)
{
__m128 shuf = _mm_movehdup_ps(v); // (v3 , v3 , v1 , v1)
@@ -3037,7 +3045,7 @@ static CPL_INLINE float XMMHorizontalAdd(__m128 v)
}
#endif

#endif // (defined(__x86_64) || defined(_M_X64))
#endif // define USE_SSE2

/************************************************************************/
/* GWKCubicResampleSrcMaskIsDensity4SampleRealT() */
@@ -3067,7 +3075,7 @@ static CPL_INLINE bool GWKCubicResampleSrcMaskIsDensity4SampleRealT(
pdfDensity, pdfReal, adfImagIgnored);
}

#if defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
#if defined(USE_SSE_CUBIC_IMPL) && defined(USE_SSE2)
const float fDeltaX = static_cast<float>(dfSrcX) - 0.5f - iSrcX;
const float fDeltaY = static_cast<float>(dfSrcY) - 0.5f - iSrcY;

@@ -3137,7 +3145,7 @@ static CPL_INLINE bool GWKCubicResampleSrcMaskIsDensity4SampleRealT(
if (fabs(*pdfReal - static_cast<int>(*pdfReal) - 0.5) > .007)
return true;

#endif // defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
#endif // defined(USE_SSE_CUBIC_IMPL) && defined(USE_SSE2)

const double dfDeltaX = dfSrcX - 0.5 - iSrcX;
const double dfDeltaY = dfSrcY - 0.5 - iSrcY;
@@ -3154,7 +3162,7 @@ static CPL_INLINE bool GWKCubicResampleSrcMaskIsDensity4SampleRealT(
for (GPtrDiff_t i = -1; i < 3; i++)
{
const GPtrDiff_t iOffset = iSrcOffset + i * poWK->nSrcXSize - 1;
#if !(defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64)))
#if !(defined(USE_SSE_CUBIC_IMPL) && defined(USE_SSE2))
if (poWK->pafUnifiedSrcDensity[iOffset + 0] < SRC_DENSITY_THRESHOLD ||
poWK->pafUnifiedSrcDensity[iOffset + 1] < SRC_DENSITY_THRESHOLD ||
poWK->pafUnifiedSrcDensity[iOffset + 2] < SRC_DENSITY_THRESHOLD ||
@@ -4167,7 +4175,7 @@ static bool GWKResampleOptimizedLanczos(const GDALWarpKernel *poWK, int iBand,
reinterpret_cast<const GByte *>(poWK->papabySrcImage[iBand]);
pSrc += iSrcOffset + static_cast<GPtrDiff_t>(jMin) * nSrcXSize;

#if defined(__x86_64) || defined(_M_X64)
#if defined(USE_SSE2)
if (iMax - iMin + 1 == 6)
{
// This is just an optimized version of the general case in
@@ -4530,7 +4538,7 @@ GWKResampleNoMasksT(const GDALWarpKernel *poWK, int iBand, double dfSrcX,

/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
/* Could possibly be used too on 32bit, but we would need to check at runtime */
#if defined(__x86_64) || defined(_M_X64)
#if defined(USE_SSE2)

/************************************************************************/
/* GWKResampleNoMasks_SSE2_T() */
@@ -4800,7 +4808,7 @@ bool GWKResampleNoMasksT<double>(const GDALWarpKernel *poWK, int iBand,

#endif /* INSTANTIATE_FLOAT64_SSE2_IMPL */

#endif /* defined(__x86_64) || defined(_M_X64) */
#endif /* defined(USE_SSE2) */

/************************************************************************/
/* GWKRoundSourceCoordinates() */
@@ -6120,7 +6128,7 @@ static CPLErr GWKRealCase(GDALWarpKernel *poWK)

/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
/* and enough SSE registries */
#if defined(__x86_64) || defined(_M_X64)
#if defined(USE_SSE2)

static inline float Convolute4x4(const __m128 row0, const __m128 row1,
const __m128 row2, const __m128 row3,
@@ -6247,7 +6255,7 @@ static void GWKCubicResampleNoMasks4MultiBandT(const GDALWarpKernel *poWK,
poWK->pafDstDensity[iDstOffset] = 1.0f;
}

#endif // defined(__x86_64) || defined(_M_X64)
#endif // defined(USE_SSE2)

/************************************************************************/
/* GWKResampleNoMasksOrDstDensityOnlyThreadInternal() */
@@ -6353,7 +6361,7 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)
const GPtrDiff_t iDstOffset =
iDstX + static_cast<GPtrDiff_t>(iDstY) * nDstXSize;

#if defined(__x86_64) || defined(_M_X64)
#if defined(USE_SSE2)
if constexpr (bUse4SamplesFormula && eResample == GRA_Cubic &&
(std::is_same<T, GByte>::value ||
std::is_same<T, GUInt16>::value))
Expand All @@ -6367,7 +6375,7 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)
continue;
}
}
#endif // defined(__x86_64) || defined(_M_X64)
#endif // defined(USE_SSE2)

[[maybe_unused]] double dfInvWeights = 0;
for (int iBand = 0; iBand < poWK->nBands; iBand++)
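
In the XMMLoad4Values hunks above, the widened preprocessor condition decides how four unsigned 8-bit (or 16-bit) samples get zero-extended to 32-bit lanes: the SSE4.1/AVX branch uses the single-instruction extension, the plain-SSE2 fallback unpacks against a zero register, and the NEON build takes the first branch, presumably because the sse2neon mapping also covers the SSE4.1 intrinsic. A small self-contained sketch of the two branches (Widen4Bytes is a hypothetical name, not GDAL's):

#include <emmintrin.h>   // SSE2: _mm_unpacklo_epi8/_epi16, _mm_setzero_si128
#if defined(__SSE4_1__) || defined(__AVX__)
#include <smmintrin.h>   // SSE4.1: _mm_cvtepu8_epi32
#endif

// Widen the 4 packed unsigned bytes in the low 32 bits of x to four
// 32-bit integer lanes, mirroring the zero-extension step in XMMLoad4Values.
static inline __m128i Widen4Bytes(__m128i x)
{
#if defined(__SSE4_1__) || defined(__AVX__)
    return _mm_cvtepu8_epi32(x);            // single-instruction zero extend
#else
    const __m128i zero = _mm_setzero_si128();
    x = _mm_unpacklo_epi8(x, zero);         // u8  -> u16
    return _mm_unpacklo_epi16(x, zero);     // u16 -> u32
#endif
}
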
11 changes: 10 additions & 1 deletion ci/travis/osx/script.sh
@@ -1,4 +1,4 @@
#!/bin/sh
#!/bin/bash

set -e

@@ -27,3 +27,12 @@ DYLD_LIBRARY_PATH=$PWD/build PYTHONPATH=$PWD/build/swig/python python3 -c "from

# Run all the Python autotests
(cd build && ctest -V -R autotest -j${NPROC})

# Use time.process_time for more reliability on VMs
# dist=no is needed because pytest-benchmark doesn't like other values of dist
# and in conftest.py/pytest.ini we set by default --dist=loadgroup
BENCHMARK_OPTIONS=(
"--dist=no" \
"--benchmark-timer=time.process_time" \
)
(cd build && python3 -m pytest autotest/benchmark "${BENCHMARK_OPTIONS[@]}" --capture=no -ra -vv)
4 changes: 4 additions & 0 deletions frmts/gti/CMakeLists.txt
@@ -24,3 +24,7 @@ endif ()
if (GDAL_ENABLE_DRIVER_GTI_PLUGIN)
target_compile_definitions(gdal_GTI PRIVATE -DBUILT_AS_PLUGIN)
endif()

if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
target_compile_definitions(gdal_GTI PRIVATE -DUSE_NEON_OPTIMIZATIONS)
endif()
6 changes: 5 additions & 1 deletion frmts/gti/gdaltileindexdataset.cpp
@@ -37,7 +37,11 @@
#include "gdal_thread_pool.h"
#include "gdal_utils.h"

#if defined(__SSE2__) || defined(_M_X64)
#ifdef USE_NEON_OPTIMIZATIONS
#define USE_SSE2_OPTIM
#define USE_SSE41_OPTIM
#include "include_sse2neon.h"
#elif defined(__SSE2__) || defined(_M_X64)
#define USE_SSE2_OPTIM
#include <emmintrin.h>
// MSVC doesn't define __SSE4_1__, but if -arch:AVX2 is enabled, we do have SSE4.1
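
The gdaltileindexdataset.cpp guard is cut off right after the MSVC remark, but the workaround that comment describes is the usual one: MSVC never defines __SSE4_1__, yet it does define __AVX__ (and __AVX2__) under /arch:AVX or /arch:AVX2, and every AVX-capable CPU supports SSE4.1. An illustrative sketch of that detection idea (not the exact condition, which is truncated above):

#if defined(__SSE4_1__) || defined(__AVX__)
// MSVC with /arch:AVX or /arch:AVX2 defines __AVX__ but not __SSE4_1__;
// AVX-capable CPUs all support SSE4.1, so enable the SSE4.1 path here too.
#define USE_SSE41_OPTIM
#include <smmintrin.h>
#endif
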
4 changes: 4 additions & 0 deletions frmts/png/CMakeLists.txt
@@ -21,3 +21,7 @@ if (GDAL_USE_ZLIB_INTERNAL)
else ()
gdal_target_link_libraries(gdal_PNG PRIVATE ZLIB::ZLIB)
endif ()

if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
target_compile_definitions(gdal_PNG PRIVATE -DUSE_NEON_OPTIMIZATIONS)
endif()
8 changes: 7 additions & 1 deletion frmts/png/filter_sse2_intrinsics.c
@@ -15,7 +15,9 @@
#endif

#ifndef PNG_INTEL_SSE_IMPLEMENTATION
#if defined(__SSE4_1__) || defined(__AVX__)
#if defined(USE_NEON_OPTIMIZATIONS)
#define PNG_INTEL_SSE_IMPLEMENTATION 3
#elif defined(__SSE4_1__) || defined(__AVX__)
/* We are not actually using AVX, but checking for AVX is the best
way we can detect SSE4.1 and SSSE3 on MSVC.
*/
@@ -30,7 +32,11 @@
#endif
#endif

#if defined(USE_NEON_OPTIMIZATIONS)
#include "include_sse2neon.h"
#else
#include <immintrin.h>
#endif

/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
* They're positioned like this:
6 changes: 3 additions & 3 deletions frmts/png/pngdataset.cpp
@@ -328,7 +328,7 @@ PNGDataset::~PNGDataset()
#include "filter_sse2_intrinsics.c"
#endif

#if defined(__GNUC__) && !defined(__SSE2__)
#if defined(__GNUC__) && !defined(__SSE2__) && !defined(USE_NEON_OPTIMIZATIONS)
__attribute__((optimize("tree-vectorize"))) static inline void
AddVectors(const GByte *CPL_RESTRICT pabyInputLine,
GByte *CPL_RESTRICT pabyOutputLine, int nSize)
@@ -677,7 +677,7 @@ CPLErr PNGDataset::LoadWholeImage(void *pSingleBuffer, GSpacing nPixelSpace,
const GByte *CPL_RESTRICT pabyOutputLineUp =
pabyOutputBuffer +
(static_cast<size_t>(iY) - 1) * nSamplesPerLine;
#if defined(__GNUC__) && !defined(__SSE2__)
#if defined(__GNUC__) && !defined(__SSE2__) && !defined(USE_NEON_OPTIMIZATIONS)
AddVectors(pabyInputLine, pabyOutputLineUp, pabyOutputLine,
nSamplesPerLine);
#else
@@ -707,7 +707,7 @@ CPLErr PNGDataset::LoadWholeImage(void *pSingleBuffer, GSpacing nPixelSpace,
}
else
{
#if defined(__GNUC__) && !defined(__SSE2__)
#if defined(__GNUC__) && !defined(__SSE2__) && !defined(USE_NEON_OPTIMIZATIONS)
AddVectors(pabyInputLine, pabyOutputLine, nSamplesPerLine);
#else
int iX;
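
In pngdataset.cpp the guard change means a NEON build now takes the SSE2-intrinsics branch (through sse2neon) instead of compiling the GCC tree-vectorize scalar fallback AddVectors. For context, here is a sketch of what such an auto-vectorized per-byte addition fallback looks like; the body is an assumption based on PNG "Up" filter reconstruction (wrapping byte adds), not code copied from GDAL:

typedef unsigned char GByte;  // stand-in for GDAL's GByte

// Scalar fallback that GCC can auto-vectorize when neither SSE2 nor NEON
// intrinsics are available: add two byte rows element-wise with wrap-around.
__attribute__((optimize("tree-vectorize"))) static void
AddVectorsSketch(const GByte *__restrict__ pabyInput,
                 const GByte *__restrict__ pabyUpLine,
                 GByte *__restrict__ pabyOutput, int nSize)
{
    for (int i = 0; i < nSize; ++i)
        pabyOutput[i] = static_cast<GByte>(pabyInput[i] + pabyUpLine[i]);
}
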
5 changes: 4 additions & 1 deletion frmts/png/pngdataset.h
@@ -49,7 +49,10 @@
#pragma warning(disable : 4611)
#endif

#if defined(__SSE2__) || defined(_M_X64) || \
#ifdef USE_NEON_OPTIMIZATIONS
#define HAVE_SSE2
#include "include_sse2neon.h"
#elif defined(__SSE2__) || defined(_M_X64) || \
(defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define HAVE_SSE2
#include <emmintrin.h>
(The remaining 4 of the 19 changed files are not shown above.)
