GalSim-developers · rmjarvis · May 5, 2023 · Feb 18, 2022 · Mar 3, 2022 · Mar 18, 2022
diff --git a/include/galsim/Laguerre.h b/include/galsim/Laguerre.h
@@ -28,6 +28,10 @@
 #if defined(__GNUC__) && __GNUC__ >= 6
 #pragma GCC diagnostic ignored "-Wint-in-bool-context"
 #endif
+
+// Clang incorrectly defines __CUDA_ARCH__ in host code when building for
+// OpenMP target offload, so we have to undefine it or Eigen gets confused
+#undef __CUDA_ARCH__
 #include "Eigen/Dense"
 using Eigen::VectorXd;
 using Eigen::MatrixXd;

diff --git a/include/galsim/PhotonArray.h b/include/galsim/PhotonArray.h
@@ -90,6 +90,12 @@ namespace galsim {
         double* getDXDZArray() { return _dxdz; }
         double* getDYDZArray() { return _dydz; }
         double* getWavelengthArray() { return _wave; }
+        const double* getXArray() const { return _x; }
+        const double* getYArray() const { return _y; }
+        const double* getFluxArray() const { return _flux; }
+        const double* getDXDZArray() const { return _dxdz; }
+        const double* getDYDZArray() const { return _dydz; }
+        const double* getWavelengthArray() const { return _wave; }
         bool hasAllocatedAngles() const { return _dxdz != 0 && _dydz != 0; }
         bool hasAllocatedWavelengths() const { return _wave != 0; }
         /**

diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h
@@ -34,20 +34,25 @@
 
 namespace galsim
 {
-
     class PUBLIC_API Silicon
     {
     public:
         Silicon(int numVertices, double numElec, int nx, int ny, int qDist,
                 double diffStep, double pixelSize, double sensorThickness, double* vertex_data,
                 const Table& tr_radial_table, Position<double> treeRingCenter,
                 const Table& abs_length_table, bool transpose);
-
-        template <typename T>
-        bool insidePixel(int ix, int iy, double x, double y, double zconv,
-                         ImageView<T> target, bool* off_edge=0) const;
-
-        void scaleBoundsToPoly(int i, int j, int nx, int ny,
+        ~Silicon();
+
+        bool insidePixel(int ix, int iy, double x, double y, double zconv,
+                         Bounds<int>& targetBounds, bool* off_edge,
+                         int emptypolySize,
+                         Bounds<double>* pixelInnerBoundsData,
+                         Bounds<double>* pixelOuterBoundsData,
+                         Position<float>* horizontalBoundaryPointsData,
+                         Position<float>* verticalBoundaryPointsData,
+                         Position<double>* emptypolyData) const;
+
+        void scaleBoundsToPoly(int i, int j, int nx, int ny,
                                const Polygon& emptypoly, Polygon& result,
                                double factor) const;
 
@@ -72,6 +77,8 @@ namespace galsim
         template <typename T>
         void initialize(ImageView<T> target, Position<int> orig_center);
 
+        void finalize();
+
         template <typename T>
         double accumulate(const PhotonArray& photons, int i1, int i2,
                           BaseDeviate rng, ImageView<T> target);
@@ -249,6 +256,12 @@ namespace galsim
 
         void updatePixelBounds(int nx, int ny, size_t k);
 
+        void updatePixelBoundsGPU(int nx, int ny, size_t k,
+                                  Bounds<double>* pixelInnerBoundsData,
+                                  Bounds<double>* pixelOuterBoundsData,
+                                  Position<float>* horizontalBoundaryPointsData,
+                                  Position<float>* verticalBoundaryPointsData);
+
         Polygon _emptypoly;
         mutable std::vector<Polygon> _testpoly;
 
@@ -265,6 +278,19 @@ namespace galsim
         Table _abs_length_table;
         bool _transpose;
         ImageAlloc<double> _delta;
+        std::shared_ptr<bool> _changed;
+
+        // GPU data
+        std::vector<double> _abs_length_table_GPU;
+        std::vector<Position<double> > _emptypolyGPU;
+        double _abs_length_arg_min, _abs_length_arg_max;
+        double _abs_length_increment;
+        int _abs_length_size;
+
+        // need to keep a pointer to the last target image's data and its data type
+        // so we can release it on the GPU later
+        void* _targetData;
+        bool _targetIsDouble;
     };
 
     PUBLIC_API int SetOMPThreads(int num_threads);

diff --git a/setup.py b/setup.py
@@ -81,6 +81,9 @@ def all_files_from(dir, ext=''):
                                 '-Wno-shorten-64-to-32','-fvisibility=hidden','-stdlib=libc++'],
     'clang w/ manual OpenMP' : ['-O2','-std=c++11','-Xpreprocessor','-fopenmp',
                                 '-Wno-shorten-64-to-32','-fvisibility=hidden','-stdlib=libc++'],
+    'clang w/ GPU' : ['-O2','-msse2','-std=c++11','-fopenmp','-fopenmp-targets=nvptx64-nvidia-cuda',
+                      '-Wno-openmp-mapping','-Wno-unknown-cuda-version',
+                      '-Wno-shorten-64-to-32','-fvisibility=hidden', '-DGALSIM_USE_GPU'],
     'unknown' : [],
 }
 lopt =  {
@@ -90,6 +93,8 @@ def all_files_from(dir, ext=''):
     'clang w/ OpenMP' : ['-stdlib=libc++','-fopenmp'],
     'clang w/ Intel OpenMP' : ['-stdlib=libc++','-liomp5'],
     'clang w/ manual OpenMP' : ['-stdlib=libc++','-lomp'],
+    'clang w/ GPU' : ['-fopenmp','-fopenmp-targets=nvptx64-nvidia-cuda',
+                      '-Wno-openmp-mapping','-Wno-unknown-cuda-version'],
     'unknown' : [],
 }
 
@@ -143,7 +148,11 @@ def get_compiler_type(compiler, check_unknown=True, output=False):
         # with the openmp flag and see if it works.
         if output:
             print('Compiler is Clang.  Checking if it is a version that supports OpenMP.')
-        if try_openmp(compiler, 'clang w/ OpenMP'):
+        if supports_gpu(compiler):
+            if output:
+                print("Yay! This version of clang supports GPU!")
+            return 'clang w/ GPU'
+        elif try_openmp(compiler, 'clang w/ OpenMP'):
             if output:
                 print("Yay! This version of clang supports OpenMP!")
             return 'clang w/ OpenMP'
@@ -193,6 +202,23 @@ def get_compiler_type(compiler, check_unknown=True, output=False):
     else:
         return 'unknown'
 
+# Check whether this build of Clang supports offloading to GPU via OpenMP.
+# Returns True if `<cc> -print-targets` mentions an nvptx target, otherwise False.
+def supports_gpu(compiler):
+    cc = compiler.compiler_so[0]
+    if cc == 'ccache':
+        cc = compiler.compiler_so[1]
+    try:
+        p = subprocess.Popen([cc,'-print-targets'], stdout=subprocess.PIPE,
+                             stderr=subprocess.STDOUT)
+        output = p.communicate()[0]  # communicate() also waits, so the child gets reaped.
+    except OSError:
+        # The compiler couldn't even be run, so don't claim GPU support.
+        return False
+    # Look for 'nvptx' in the output. May need a more general check in future to support
+    # other GPU architectures.  Compare as bytes so undecodable output can't raise.
+    return b'nvptx' in output
+
 # Check for the fftw3 library in some likely places
 def find_fftw_lib(output=False):
     import distutils.sysconfig
@@ -435,7 +461,7 @@ def try_compile(cpp_code, compiler, cflags=[], lflags=[], prepend=None, check_wa
         cpp_name = cpp_file.name
 
     # Just get a named temporary file to write to:
-    with tempfile.NamedTemporaryFile(delete=False, suffix='.os', dir=local_tmp) as o_file:
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.o', dir=local_tmp) as o_file:
         o_name = o_file.name
 
     # Another named temporary file for the executable
@@ -486,7 +512,8 @@ def try_compile(cpp_code, compiler, cflags=[], lflags=[], prepend=None, check_wa
             print('Trying link command:')
             print(' '.join(cmd))
             print('Output was:')
-            print('   ',b'   '.join(lines).decode())
+            # Print raw bytes; .decode() dropped (presumably it failed on some output)
+            print('   ',b'   '.join(lines))
         returncode = p.returncode
     except OSError as e:
         if debug:
@@ -538,7 +565,8 @@ def try_compile(cpp_code, compiler, cflags=[], lflags=[], prepend=None, check_wa
                 print('Trying link command:')
                 print(' '.join(cmd))
                 print('Output was:')
-                print('   ',b'   '.join(lines).decode())
+                # Print raw bytes; .decode() dropped (presumably it failed on some output)
+                print('   ',b'   '.join(lines))
             returncode = p.returncode
         except OSError as e:
             if debug:
@@ -823,8 +851,6 @@ def add_dirs(builder, output=False):
     if hasattr(builder, 'library_dirs'):
         if fftw_libpath != '':
             builder.library_dirs.append(fftw_libpath)
-        builder.libraries.append('galsim')  # Make sure galsim comes before fftw3
-        builder.libraries.append(os.path.split(fftw_lib)[1].split('.')[0][3:])
     fftw_include = os.path.join(os.path.split(fftw_libpath)[0], 'include')
     if os.path.isfile(os.path.join(fftw_include, 'fftw3.h')):
         print('Include directory for fftw3 is ',fftw_include)
@@ -1276,7 +1302,8 @@ def run_tests(self):
 ext=Extension("galsim._galsim",
               py_sources,
               depends = cpp_sources + headers + inst,
-              undef_macros = undef_macros)
+              undef_macros = undef_macros,
+              extra_link_args = ["-lfftw3"])
 
 build_dep = ['setuptools>=38', 'pybind11>=2.2', 'numpy>=1.17']
 run_dep = ['astropy', 'LSSTDESC.Coord']

diff --git a/src/RealGalaxy.cpp b/src/RealGalaxy.cpp
@@ -20,6 +20,10 @@
 #if defined(__GNUC__) && __GNUC__ >= 6
 #pragma GCC diagnostic ignored "-Wint-in-bool-context"
 #endif
+
+// Clang incorrectly defines __CUDA_ARCH__ in host code when building for
+// OpenMP target offload, so we have to undefine it or Eigen gets confused
+#undef __CUDA_ARCH__
 #include "Eigen/Dense"
 
 #include "RealGalaxy.h"