diff --git a/.azure-pipelines/azure-pipelines-linux.yml b/.azure-pipelines/azure-pipelines-linux.yml
index be1ea3f..6aedfe0 100755
--- a/.azure-pipelines/azure-pipelines-linux.yml
+++ b/.azure-pipelines/azure-pipelines-linux.yml
@@ -20,6 +20,10 @@ jobs:
CONFIG: linux_64_c_compiler_version12cuda_compilerNonecuda_compiler_versionNonecxx_compiler_version12
UPLOAD_PACKAGES: 'True'
DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cos7-x86_64
+ linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12:
+ CONFIG: linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12
+ UPLOAD_PACKAGES: 'True'
+ DOCKER_IMAGE: quay.io/condaforge/linux-anvil-cos7-x86_64
linux_64_c_compiler_version7cuda_compilernvcccuda_compiler_version10.2cxx_compiler_version7:
CONFIG: linux_64_c_compiler_version7cuda_compilernvcccuda_compiler_version10.2cxx_compiler_version7
UPLOAD_PACKAGES: 'True'
diff --git a/.ci_support/linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12.yaml b/.ci_support/linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12.yaml
new file mode 100644
index 0000000..eef5d92
--- /dev/null
+++ b/.ci_support/linux_64_c_compiler_version12cuda_compilercuda-nvcccuda_compiler_version12.0cxx_compiler_version12.yaml
@@ -0,0 +1,66 @@
+boost_cpp:
+- 1.78.0
+c_compiler:
+- gcc
+c_compiler_version:
+- '12'
+cdt_name:
+- cos7
+channel_sources:
+- conda-forge
+channel_targets:
+- conda-forge main
+cuda_compiler:
+- cuda-nvcc
+cuda_compiler_version:
+- '12.0'
+cxx_compiler:
+- gxx
+cxx_compiler_version:
+- '12'
+docker_image:
+- quay.io/condaforge/linux-anvil-cos7-x86_64
+flann:
+- 1.9.1
+gflags:
+- '2.2'
+glew:
+- '2.1'
+glog:
+- '0.6'
+gmp:
+- '6'
+libblas:
+- 3.9 *netlib
+libcblas:
+- 3.9 *netlib
+libxcb:
+- '1.15'
+lz4_c:
+- 1.9.3
+metis:
+- '5.1'
+pin_run_as_build:
+ boost-cpp:
+ max_pin: x.x.x
+ flann:
+ max_pin: x.x.x
+ vlfeat:
+ max_pin: x.x.x
+qt_main:
+- '5.15'
+sqlite:
+- '3'
+suitesparse:
+- '5'
+target_platform:
+- linux-64
+vlfeat:
+- 0.9.21
+zip_keys:
+- - c_compiler_version
+ - cxx_compiler_version
+ - cuda_compiler
+ - cuda_compiler_version
+ - cdt_name
+ - docker_image
diff --git a/.ci_support/migrations/cuda120.yaml b/.ci_support/migrations/cuda120.yaml
new file mode 100644
index 0000000..25f0f88
--- /dev/null
+++ b/.ci_support/migrations/cuda120.yaml
@@ -0,0 +1,72 @@
+migrator_ts: 1682985063
+__migrator:
+ kind:
+ version
+ migration_number:
+ 1
+ build_number:
+ 1
+ paused: false
+ override_cbc_keys:
+ - cuda_compiler_stub
+ operation: key_add
+ check_solvable: false
+ primary_key: cuda_compiler_version
+ ordering:
+ cxx_compiler_version:
+ - 9
+ - 8
+ - 7
+ c_compiler_version:
+ - 9
+ - 8
+ - 7
+ fortran_compiler_version:
+ - 9
+ - 8
+ - 7
+ docker_image:
+ - quay.io/condaforge/linux-anvil-comp7 # [os.environ.get("BUILD_PLATFORM") == "linux-64"]
+ - quay.io/condaforge/linux-anvil-aarch64 # [os.environ.get("BUILD_PLATFORM") == "linux-aarch64"]
+ - quay.io/condaforge/linux-anvil-ppc64le # [os.environ.get("BUILD_PLATFORM") == "linux-ppc64le"]
+ - quay.io/condaforge/linux-anvil-armv7l # [os.environ.get("BUILD_PLATFORM") == "linux-armv7l"]
+ - quay.io/condaforge/linux-anvil-cuda:9.2 # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+ - quay.io/condaforge/linux-anvil-cuda:10.0 # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+ - quay.io/condaforge/linux-anvil-cuda:10.1 # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+ - quay.io/condaforge/linux-anvil-cuda:10.2 # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+ - quay.io/condaforge/linux-anvil-cuda:11.0 # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+ - quay.io/condaforge/linux-anvil-cuda:11.1 # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+ - quay.io/condaforge/linux-anvil-cuda:11.2 # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+ - quay.io/condaforge/linux-anvil-cos7-x86_64 # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64"]
+ cuda_compiler_version:
+ - None
+ - 10.2 # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+ - 11.0 # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+ - 11.1 # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+ - 11.2 # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+ - 12.0 # [(linux64 or win) and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+ commit_message: "Rebuild for CUDA 12"
+
+cuda_compiler: # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+ - cuda-nvcc # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+cuda_compiler_version: # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+ - 12.0 # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+c_compiler_version: # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+ - 12 # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+cxx_compiler_version: # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+ - 12 # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+fortran_compiler_version: # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+ - 12 # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+cudnn: # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+ - 8 # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+cdt_name: # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+ - cos7 # [linux64 and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+
+docker_image: # [os.environ.get("BUILD_PLATFORM", "").startswith("linux-") and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
+ - quay.io/condaforge/linux-anvil-cos7-x86_64 # [linux64 and os.environ.get("BUILD_PLATFORM") == "linux-64" and os.environ.get("CF_CUDA_ENABLED", "False") == "True"]
diff --git a/README.md b/README.md
index dc83bb8..4ab6a92 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,13 @@ Current build status
+
linux_64_c_compiler_version7cuda_compilernvcccuda_compiler_version10.2cxx_compiler_version7 |
diff --git a/recipe/1809.patch b/recipe/1809.patch
new file mode 100644
index 0000000..0958c97
--- /dev/null
+++ b/recipe/1809.patch
@@ -0,0 +1,1125 @@
+diff --git a/src/mvs/cuda_array_wrapper.h b/src/mvs/cuda_array_wrapper.h
+deleted file mode 100644
+index e4e48b0e8..000000000
+--- a/src/mvs/cuda_array_wrapper.h
++++ /dev/null
+@@ -1,171 +0,0 @@
+-// Copyright (c) 2023, ETH Zurich and UNC Chapel Hill.
+-// All rights reserved.
+-//
+-// Redistribution and use in source and binary forms, with or without
+-// modification, are permitted provided that the following conditions are met:
+-//
+-// * Redistributions of source code must retain the above copyright
+-// notice, this list of conditions and the following disclaimer.
+-//
+-// * Redistributions in binary form must reproduce the above copyright
+-// notice, this list of conditions and the following disclaimer in the
+-// documentation and/or other materials provided with the distribution.
+-//
+-// * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
+-// its contributors may be used to endorse or promote products derived
+-// from this software without specific prior written permission.
+-//
+-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+-// POSSIBILITY OF SUCH DAMAGE.
+-//
+-// Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de)
+-
+-#ifndef COLMAP_SRC_MVS_CUDA_ARRAY_WRAPPER_H_
+-#define COLMAP_SRC_MVS_CUDA_ARRAY_WRAPPER_H_
+-
+-#include
+-
+-#include
+-
+-#include "mvs/gpu_mat.h"
+-#include "util/cudacc.h"
+-
+-namespace colmap {
+-namespace mvs {
+-
+-template
+-class CudaArrayWrapper {
+- public:
+- CudaArrayWrapper(const size_t width, const size_t height, const size_t depth);
+- ~CudaArrayWrapper();
+-
+- const cudaArray* GetPtr() const;
+- cudaArray* GetPtr();
+-
+- size_t GetWidth() const;
+- size_t GetHeight() const;
+- size_t GetDepth() const;
+-
+- void CopyToDevice(const T* data);
+- void CopyToHost(const T* data);
+- void CopyFromGpuMat(const GpuMat& array);
+-
+- private:
+- // Define class as non-copyable and non-movable.
+- CudaArrayWrapper(CudaArrayWrapper const&) = delete;
+- void operator=(CudaArrayWrapper const& obj) = delete;
+- CudaArrayWrapper(CudaArrayWrapper&&) = delete;
+-
+- void Allocate();
+- void Deallocate();
+-
+- cudaArray* array_;
+-
+- size_t width_;
+- size_t height_;
+- size_t depth_;
+-};
+-
+-////////////////////////////////////////////////////////////////////////////////
+-// Implementation
+-////////////////////////////////////////////////////////////////////////////////
+-
+-template
+-CudaArrayWrapper::CudaArrayWrapper(const size_t width, const size_t height,
+- const size_t depth)
+- : width_(width), height_(height), depth_(depth), array_(nullptr) {}
+-
+-template
+-CudaArrayWrapper::~CudaArrayWrapper() {
+- Deallocate();
+-}
+-
+-template
+-const cudaArray* CudaArrayWrapper::GetPtr() const {
+- return array_;
+-}
+-
+-template
+-cudaArray* CudaArrayWrapper::GetPtr() {
+- return array_;
+-}
+-
+-template
+-size_t CudaArrayWrapper::GetWidth() const {
+- return width_;
+-}
+-
+-template
+-size_t CudaArrayWrapper::GetHeight() const {
+- return height_;
+-}
+-
+-template
+-size_t CudaArrayWrapper::GetDepth() const {
+- return depth_;
+-}
+-
+-template
+-void CudaArrayWrapper::CopyToDevice(const T* data) {
+- cudaMemcpy3DParms params = {0};
+- Allocate();
+- params.extent = make_cudaExtent(width_, height_, depth_);
+- params.kind = cudaMemcpyHostToDevice;
+- params.dstArray = array_;
+- params.srcPtr =
+- make_cudaPitchedPtr((void*)data, width_ * sizeof(T), width_, height_);
+- CUDA_SAFE_CALL(cudaMemcpy3D(¶ms));
+-}
+-
+-template
+-void CudaArrayWrapper::CopyToHost(const T* data) {
+- cudaMemcpy3DParms params = {0};
+- params.extent = make_cudaExtent(width_, height_, depth_);
+- params.kind = cudaMemcpyDeviceToHost;
+- params.dstPtr =
+- make_cudaPitchedPtr((void*)data, width_ * sizeof(T), width_, height_);
+- params.srcArray = array_;
+- CUDA_SAFE_CALL(cudaMemcpy3D(¶ms));
+-}
+-
+-template
+-void CudaArrayWrapper::CopyFromGpuMat(const GpuMat& array) {
+- Allocate();
+- cudaMemcpy3DParms parameters = {0};
+- parameters.extent = make_cudaExtent(width_, height_, depth_);
+- parameters.kind = cudaMemcpyDeviceToDevice;
+- parameters.dstArray = array_;
+- parameters.srcPtr = make_cudaPitchedPtr((void*)array.GetPtr(),
+- array.GetPitch(), width_, height_);
+- CUDA_SAFE_CALL(cudaMemcpy3D(¶meters));
+-}
+-
+-template
+-void CudaArrayWrapper::Allocate() {
+- Deallocate();
+- struct cudaExtent extent = make_cudaExtent(width_, height_, depth_);
+- cudaChannelFormatDesc fmt = cudaCreateChannelDesc();
+- CUDA_SAFE_CALL(cudaMalloc3DArray(&array_, &fmt, extent, cudaArrayLayered));
+-}
+-
+-template
+-void CudaArrayWrapper::Deallocate() {
+- if (array_ != nullptr) {
+- CUDA_SAFE_CALL(cudaFreeArray(array_));
+- array_ = nullptr;
+- }
+-}
+-
+-} // namespace mvs
+-} // namespace colmap
+-
+-#endif // COLMAP_SRC_MVS_CUDA_ARRAY_WRAPPER_H_
+diff --git a/src/mvs/cuda_texture.h b/src/mvs/cuda_texture.h
+new file mode 100644
+index 000000000..3dcd8d171
+--- /dev/null
++++ b/src/mvs/cuda_texture.h
+@@ -0,0 +1,180 @@
++// Copyright (c) 2023, ETH Zurich and UNC Chapel Hill.
++// All rights reserved.
++//
++// Redistribution and use in source and binary forms, with or without
++// modification, are permitted provided that the following conditions are met:
++//
++// * Redistributions of source code must retain the above copyright
++// notice, this list of conditions and the following disclaimer.
++//
++// * Redistributions in binary form must reproduce the above copyright
++// notice, this list of conditions and the following disclaimer in the
++// documentation and/or other materials provided with the distribution.
++//
++// * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
++// its contributors may be used to endorse or promote products derived
++// from this software without specific prior written permission.
++//
++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
++// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
++// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
++// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
++// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
++// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
++// POSSIBILITY OF SUCH DAMAGE.
++//
++// Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de)
++
++#ifndef COLMAP_SRC_MVS_CUDA_TEXTURE_H_
++#define COLMAP_SRC_MVS_CUDA_TEXTURE_H_
++
++#include
++
++#include
++
++#include "mvs/gpu_mat.h"
++#include "util/cudacc.h"
++#include "util/logging.h"
++
++namespace colmap {
++namespace mvs {
++
++template
++class CudaArrayLayeredTexture {
++ public:
++ static std::unique_ptr> FromGpuMat(
++ const cudaTextureDesc& texture_desc, const GpuMat& mat);
++ static std::unique_ptr> FromHostArray(
++ const cudaTextureDesc& texture_desc, const size_t width,
++ const size_t height, const size_t depth, const T* data);
++
++ cudaTextureObject_t GetObj() const;
++
++ size_t GetWidth() const;
++ size_t GetHeight() const;
++ size_t GetDepth() const;
++
++ CudaArrayLayeredTexture(const cudaTextureDesc& texture_desc,
++ const size_t width, const size_t height,
++ const size_t depth);
++ ~CudaArrayLayeredTexture();
++
++ private:
++ // Define class as non-copyable and non-movable.
++ CudaArrayLayeredTexture(CudaArrayLayeredTexture const&) = delete;
++ void operator=(CudaArrayLayeredTexture const& obj) = delete;
++ CudaArrayLayeredTexture(CudaArrayLayeredTexture&&) = delete;
++
++ const size_t width_;
++ const size_t height_;
++ const size_t depth_;
++
++ cudaArray_t array_;
++ const cudaTextureDesc texture_desc_;
++ cudaResourceDesc resource_desc_;
++ cudaTextureObject_t texture_;
++};
++
++////////////////////////////////////////////////////////////////////////////////
++// Implementation
++////////////////////////////////////////////////////////////////////////////////
++
++template
++std::unique_ptr>
++CudaArrayLayeredTexture::FromGpuMat(const cudaTextureDesc& texture_desc,
++ const GpuMat& mat) {
++ auto array = std::make_unique>(
++ texture_desc, mat.GetWidth(), mat.GetHeight(), mat.GetDepth());
++
++ cudaMemcpy3DParms params;
++ memset(¶ms, 0, sizeof(params));
++ params.extent =
++ make_cudaExtent(mat.GetWidth(), mat.GetHeight(), mat.GetDepth());
++ params.kind = cudaMemcpyDeviceToDevice;
++ params.srcPtr = make_cudaPitchedPtr((void*)mat.GetPtr(), mat.GetPitch(),
++ mat.GetWidth(), mat.GetHeight());
++ params.dstArray = array->array_;
++ CUDA_SAFE_CALL(cudaMemcpy3D(¶ms));
++
++ return array;
++}
++
++template
++std::unique_ptr>
++CudaArrayLayeredTexture::FromHostArray(const cudaTextureDesc& texture_desc,
++ const size_t width,
++ const size_t height,
++ const size_t depth, const T* data) {
++ auto array = std::make_unique>(texture_desc, width,
++ height, depth);
++
++ cudaMemcpy3DParms params;
++ memset(¶ms, 0, sizeof(params));
++ params.extent = make_cudaExtent(width, height, depth);
++ params.kind = cudaMemcpyHostToDevice;
++ params.srcPtr =
++ make_cudaPitchedPtr((void*)data, width * sizeof(T), width, height);
++ params.dstArray = array->array_;
++ CUDA_SAFE_CALL(cudaMemcpy3D(¶ms));
++
++ return array;
++}
++
++template
++CudaArrayLayeredTexture::CudaArrayLayeredTexture(
++ const cudaTextureDesc& texture_desc, const size_t width,
++ const size_t height, const size_t depth)
++ : texture_desc_(texture_desc),
++ width_(width),
++ height_(height),
++ depth_(depth) {
++ CHECK_GT(width_, 0);
++ CHECK_GT(height_, 0);
++ CHECK_GT(depth_, 0);
++
++ cudaExtent extent = make_cudaExtent(width_, height_, depth_);
++ cudaChannelFormatDesc fmt = cudaCreateChannelDesc();
++ CUDA_SAFE_CALL(cudaMalloc3DArray(&array_, &fmt, extent, cudaArrayLayered));
++
++ memset(&resource_desc_, 0, sizeof(resource_desc_));
++ resource_desc_.resType = cudaResourceTypeArray;
++ resource_desc_.res.array.array = array_;
++
++ CUDA_SAFE_CALL(cudaCreateTextureObject(&texture_, &resource_desc_,
++ &texture_desc_, nullptr));
++}
++
++template
++CudaArrayLayeredTexture::~CudaArrayLayeredTexture() {
++ CUDA_SAFE_CALL(cudaFreeArray(array_));
++ CUDA_SAFE_CALL(cudaDestroyTextureObject(texture_));
++}
++
++template
++cudaTextureObject_t CudaArrayLayeredTexture::GetObj() const {
++ return texture_;
++}
++
++template
++size_t CudaArrayLayeredTexture::GetWidth() const {
++ return width_;
++}
++
++template
++size_t CudaArrayLayeredTexture::GetHeight() const {
++ return height_;
++}
++
++template
++size_t CudaArrayLayeredTexture::GetDepth() const {
++ return depth_;
++}
++
++} // namespace mvs
++} // namespace colmap
++
++#endif // COLMAP_SRC_MVS_CUDA_TEXTURE_H_
+diff --git a/src/mvs/gpu_mat_ref_image.cu b/src/mvs/gpu_mat_ref_image.cu
+index c40a10bc3..1e3cc9d5e 100644
+--- a/src/mvs/gpu_mat_ref_image.cu
++++ b/src/mvs/gpu_mat_ref_image.cu
+@@ -39,9 +39,8 @@ namespace colmap {
+ namespace mvs {
+ namespace {
+
+-texture image_texture;
+-
+-__global__ void FilterKernel(GpuMat image, GpuMat sum_image,
++__global__ void FilterKernel(const cudaTextureObject_t image_texture,
++ GpuMat image, GpuMat sum_image,
+ GpuMat squared_sum_image,
+ const int window_radius, const int window_step,
+ const float sigma_spatial,
+@@ -54,7 +53,7 @@ __global__ void FilterKernel(GpuMat image, GpuMat sum_image,
+
+ BilateralWeightComputer bilateral_weight_computer(sigma_spatial, sigma_color);
+
+- const float center_color = tex2D(image_texture, col, row);
++ const float center_color = tex2D(image_texture, col, row);
+
+ float color_sum = 0.0f;
+ float color_squared_sum = 0.0f;
+@@ -65,7 +64,7 @@ __global__ void FilterKernel(GpuMat image, GpuMat sum_image,
+ for (int window_col = -window_radius; window_col <= window_radius;
+ window_col += window_step) {
+ const float color =
+- tex2D(image_texture, col + window_col, row + window_row);
++ tex2D(image_texture, col + window_col, row + window_row);
+ const float bilateral_weight = bilateral_weight_computer.Compute(
+ window_row, window_col, center_color, color);
+ color_sum += bilateral_weight * color;
+@@ -95,24 +94,25 @@ void GpuMatRefImage::Filter(const uint8_t* image_data,
+ const size_t window_radius,
+ const size_t window_step, const float sigma_spatial,
+ const float sigma_color) {
+- CudaArrayWrapper image_array(width_, height_, 1);
+- image_array.CopyToDevice(image_data);
+- image_texture.addressMode[0] = cudaAddressModeBorder;
+- image_texture.addressMode[1] = cudaAddressModeBorder;
+- image_texture.addressMode[2] = cudaAddressModeBorder;
+- image_texture.filterMode = cudaFilterModePoint;
+- image_texture.normalized = false;
++ cudaTextureDesc texture_desc;
++ memset(&texture_desc, 0, sizeof(texture_desc));
++ texture_desc.addressMode[0] = cudaAddressModeBorder;
++ texture_desc.addressMode[1] = cudaAddressModeBorder;
++ texture_desc.addressMode[2] = cudaAddressModeBorder;
++ texture_desc.filterMode = cudaFilterModePoint;
++ texture_desc.readMode = cudaReadModeNormalizedFloat;
++ texture_desc.normalizedCoords = false;
++ auto image_texture = CudaArrayLayeredTexture::FromHostArray(
++ texture_desc, width_, height_, 1, image_data);
+
+ const dim3 block_size(kBlockDimX, kBlockDimY);
+ const dim3 grid_size((width_ - 1) / block_size.x + 1,
+ (height_ - 1) / block_size.y + 1);
+
+- CUDA_SAFE_CALL(cudaBindTextureToArray(image_texture, image_array.GetPtr()));
+ FilterKernel<<>>(
+- *image, *sum_image, *squared_sum_image, window_radius, window_step,
+- sigma_spatial, sigma_color);
++ image_texture->GetObj(), *image, *sum_image, *squared_sum_image,
++ window_radius, window_step, sigma_spatial, sigma_color);
+ CUDA_SYNC_AND_CHECK();
+- CUDA_SAFE_CALL(cudaUnbindTexture(image_texture));
+ }
+
+ } // namespace mvs
+diff --git a/src/mvs/gpu_mat_ref_image.h b/src/mvs/gpu_mat_ref_image.h
+index 1e04e5f43..4f4be34e6 100644
+--- a/src/mvs/gpu_mat_ref_image.h
++++ b/src/mvs/gpu_mat_ref_image.h
+@@ -34,7 +34,7 @@
+
+ #include
+
+-#include "mvs/cuda_array_wrapper.h"
++#include "mvs/cuda_texture.h"
+ #include "mvs/gpu_mat.h"
+
+ namespace colmap {
+@@ -64,8 +64,8 @@ class GpuMatRefImage {
+ const static size_t kBlockDimX = 16;
+ const static size_t kBlockDimY = 12;
+
+- size_t width_;
+- size_t height_;
++ const size_t width_;
++ const size_t height_;
+ };
+
+ struct BilateralWeightComputer {
+diff --git a/src/mvs/patch_match_cuda.cu b/src/mvs/patch_match_cuda.cu
+index 845fffa94..772f341a1 100644
+--- a/src/mvs/patch_match_cuda.cu
++++ b/src/mvs/patch_match_cuda.cu
+@@ -56,14 +56,6 @@
+ namespace colmap {
+ namespace mvs {
+
+-texture
+- ref_image_texture;
+-texture
+- src_images_texture;
+-texture
+- src_depth_maps_texture;
+-texture poses_texture;
+-
+ // Calibration of reference image as {fx, cx, fy, cy}.
+ __constant__ float ref_K[4];
+ // Calibration of reference image as {1/fx, -cx/fx, 1/fy, -cy/fy}.
+@@ -229,18 +221,17 @@ __device__ inline float PropagateDepth(const float depth1,
+ // First, compute triangulation angle between reference and source image for 3D
+ // point. Second, compute incident angle between viewing direction of source
+ // image and normal direction of 3D point. Both angles are cosine distances.
+-__device__ inline void ComputeViewingAngles(const float point[3],
+- const float normal[3],
+- const int image_idx,
+- float* cos_triangulation_angle,
+- float* cos_incident_angle) {
++__device__ inline void ComputeViewingAngles(
++ const cudaTextureObject_t poses_texture, const float point[3],
++ const float normal[3], const int image_idx, float* cos_triangulation_angle,
++ float* cos_incident_angle) {
+ *cos_triangulation_angle = 0.0f;
+ *cos_incident_angle = 0.0f;
+
+ // Projection center of source image.
+ float C[3];
+ for (int i = 0; i < 3; ++i) {
+- C[i] = tex2D(poses_texture, i + 16, image_idx);
++ C[i] = tex2D(poses_texture, i + 16, image_idx);
+ }
+
+ // Ray from point to camera.
+@@ -256,25 +247,25 @@ __device__ inline void ComputeViewingAngles(const float point[3],
+ *cos_triangulation_angle = DotProduct3(SX, point) * RX_inv_norm * SX_inv_norm;
+ }
+
+-__device__ inline void ComposeHomography(const int image_idx, const int row,
+- const int col, const float depth,
+- const float normal[3], float H[9]) {
++__device__ inline void ComposeHomography(
++ const cudaTextureObject_t poses_texture, const int image_idx, const int row,
++ const int col, const float depth, const float normal[3], float H[9]) {
+ // Calibration of source image.
+ float K[4];
+ for (int i = 0; i < 4; ++i) {
+- K[i] = tex2D(poses_texture, i, image_idx);
++ K[i] = tex2D(poses_texture, i, image_idx);
+ }
+
+ // Relative rotation between reference and source image.
+ float R[9];
+ for (int i = 0; i < 9; ++i) {
+- R[i] = tex2D(poses_texture, i + 4, image_idx);
++ R[i] = tex2D(poses_texture, i + 4, image_idx);
+ }
+
+ // Relative translation between reference and source image.
+ float T[3];
+ for (int i = 0; i < 3; ++i) {
+- T[i] = tex2D(poses_texture, i + 13, image_idx);
++ T[i] = tex2D(poses_texture, i + 13, image_idx);
+ }
+
+ // Distance to the plane.
+@@ -332,6 +323,9 @@ struct LocalRefImage {
+ const static int kNumColumns = kThreadBlockSize * THREADS_PER_BLOCK;
+ const static int kDataSize = kNumRows * kNumColumns;
+
++ __device__ explicit LocalRefImage(const cudaTextureObject_t ref_image_texture)
++ : ref_image_texture_(ref_image_texture) {}
++
+ float* data = nullptr;
+
+ __device__ inline void Read(const int row) {
+@@ -357,7 +351,7 @@ struct LocalRefImage {
+ #pragma unroll
+ for (int block = 0; block < kThreadBlockSize; ++block) {
+ data[local_row * kNumColumns + local_col] =
+- tex2D(ref_image_texture, global_col, global_row);
++ tex2D(ref_image_texture_, global_col, global_row);
+ local_col += THREADS_PER_BLOCK;
+ global_col += THREADS_PER_BLOCK;
+ }
+@@ -382,12 +376,15 @@ struct LocalRefImage {
+ #pragma unroll
+ for (int block = 0; block < kThreadBlockSize; ++block) {
+ data[local_row * kNumColumns + local_col] =
+- tex2D(ref_image_texture, global_col, global_row);
++ tex2D(ref_image_texture_, global_col, global_row);
+ local_col += THREADS_PER_BLOCK;
+ global_col += THREADS_PER_BLOCK;
+ }
+ }
+ }
++
++ private:
++ const cudaTextureObject_t ref_image_texture_;
+ };
+
+ // The return values is 1 - NCC, so the range is [0, 2], the smaller the
+@@ -396,9 +393,15 @@ template
+ struct PhotoConsistencyCostComputer {
+ const static int kWindowRadius = kWindowSize / 2;
+
+- __device__ PhotoConsistencyCostComputer(const float sigma_spatial,
+- const float sigma_color)
+- : bilateral_weight_computer_(sigma_spatial, sigma_color) {}
++ __device__ PhotoConsistencyCostComputer(
++ const cudaTextureObject_t ref_image_texture,
++ const cudaTextureObject_t src_images_texture,
++ const cudaTextureObject_t poses_texture, const float sigma_spatial,
++ const float sigma_color)
++ : local_ref_image(ref_image_texture),
++ src_images_texture_(src_images_texture),
++ poses_texture_(poses_texture),
++ bilateral_weight_computer_(sigma_spatial, sigma_color) {}
+
+ // Maximum photo consistency cost as 1 - min(NCC).
+ const float kMaxCost = 2.0f;
+@@ -429,7 +432,8 @@ struct PhotoConsistencyCostComputer {
+
+ __device__ inline float Compute() const {
+ float tform[9];
+- ComposeHomography(src_image_idx, row, col, depth, normal, tform);
++ ComposeHomography(poses_texture_, src_image_idx, row, col, depth, normal,
++ tform);
+
+ float tform_step[8];
+ for (int i = 0; i < 8; ++i) {
+@@ -467,8 +471,8 @@ struct PhotoConsistencyCostComputer {
+ const float norm_col_src = inv_z * col_src + 0.5f;
+ const float norm_row_src = inv_z * row_src + 0.5f;
+ const float ref_color = local_ref_image.data[ref_image_idx];
+- const float src_color = tex2DLayered(src_images_texture, norm_col_src,
+- norm_row_src, src_image_idx);
++ const float src_color = tex2DLayered(
++ src_images_texture_, norm_col_src, norm_row_src, src_image_idx);
+
+ const float bilateral_weight = bilateral_weight_computer_.Compute(
+ row, col, ref_center_color, ref_color);
+@@ -528,22 +532,24 @@ struct PhotoConsistencyCostComputer {
+ }
+
+ private:
++ const cudaTextureObject_t src_images_texture_;
++ const cudaTextureObject_t poses_texture_;
+ const BilateralWeightComputer bilateral_weight_computer_;
+ };
+
+-__device__ inline float ComputeGeomConsistencyCost(const float row,
+- const float col,
+- const float depth,
+- const int image_idx,
+- const float max_cost) {
++__device__ inline float ComputeGeomConsistencyCost(
++ const cudaTextureObject_t poses_texture,
++ const cudaTextureObject_t src_depth_maps_texture, const float row,
++ const float col, const float depth, const int image_idx,
++ const float max_cost) {
+ // Extract projection matrices for source image.
+ float P[12];
+ for (int i = 0; i < 12; ++i) {
+- P[i] = tex2D(poses_texture, i + 19, image_idx);
++ P[i] = tex2D(poses_texture, i + 19, image_idx);
+ }
+ float inv_P[12];
+ for (int i = 0; i < 12; ++i) {
+- inv_P[i] = tex2D(poses_texture, i + 31, image_idx);
++ inv_P[i] = tex2D(poses_texture, i + 31, image_idx);
+ }
+
+ // Project point in reference image to world.
+@@ -562,8 +568,8 @@ __device__ inline float ComputeGeomConsistencyCost(const float row,
+ P[6] * forward_point[2] + P[7]);
+
+ // Extract depth in source image.
+- const float src_depth = tex2DLayered(src_depth_maps_texture, src_col + 0.5f,
+- src_row + 0.5f, image_idx);
++ const float src_depth = tex2DLayered(
++ src_depth_maps_texture, src_col + 0.5f, src_row + 0.5f, image_idx);
+
+ // Projection outside of source image.
+ if (src_depth == 0.0f) {
+@@ -794,15 +800,20 @@ template
+ __global__ void ComputeInitialCost(GpuMat cost_map,
+ const GpuMat depth_map,
+ const GpuMat normal_map,
++ const cudaTextureObject_t ref_image_texture,
+ const GpuMat ref_sum_image,
+ const GpuMat ref_squared_sum_image,
++ const cudaTextureObject_t src_images_texture,
++ const cudaTextureObject_t poses_texture,
+ const float sigma_spatial,
+ const float sigma_color) {
+ const int col = blockDim.x * blockIdx.x + threadIdx.x;
+
+ typedef PhotoConsistencyCostComputer
+ PhotoConsistencyCostComputerType;
+- PhotoConsistencyCostComputerType pcc_computer(sigma_spatial, sigma_color);
++ PhotoConsistencyCostComputerType pcc_computer(
++ ref_image_texture, src_images_texture, poses_texture, sigma_spatial,
++ sigma_color);
+ pcc_computer.col = col;
+
+ __shared__ float local_ref_image_data
+@@ -859,8 +870,13 @@ __global__ void SweepFromTopToBottom(
+ GpuMat global_workspace, GpuMat rand_state_map,
+ GpuMat cost_map, GpuMat depth_map, GpuMat normal_map,
+ GpuMat consistency_mask, GpuMat sel_prob_map,
+- const GpuMat prev_sel_prob_map, const GpuMat ref_sum_image,
+- const GpuMat ref_squared_sum_image, const SweepOptions options) {
++ const GpuMat prev_sel_prob_map,
++ const cudaTextureObject_t ref_image_texture,
++ const GpuMat ref_sum_image,
++ const GpuMat ref_squared_sum_image,
++ const cudaTextureObject_t src_images_texture,
++ const cudaTextureObject_t src_depth_maps_texture,
++ const cudaTextureObject_t poses_texture, const SweepOptions options) {
+ const int col = blockDim.x * blockIdx.x + threadIdx.x;
+
+ // Probability for boundary pixels.
+@@ -904,8 +920,9 @@ __global__ void SweepFromTopToBottom(
+
+ typedef PhotoConsistencyCostComputer
+ PhotoConsistencyCostComputerType;
+- PhotoConsistencyCostComputerType pcc_computer(options.sigma_spatial,
+- options.sigma_color);
++ PhotoConsistencyCostComputerType pcc_computer(
++ ref_image_texture, src_images_texture, poses_texture,
++ options.sigma_spatial, options.sigma_color);
+ pcc_computer.col = col;
+
+ __shared__ float local_ref_image_data
+@@ -982,16 +999,17 @@ __global__ void SweepFromTopToBottom(
+
+ float cos_triangulation_angle;
+ float cos_incident_angle;
+- ComputeViewingAngles(point, curr_param_state.normal, image_idx,
+- &cos_triangulation_angle, &cos_incident_angle);
++ ComputeViewingAngles(poses_texture, point, curr_param_state.normal,
++ image_idx, &cos_triangulation_angle,
++ &cos_incident_angle);
+ const float tri_prob =
+ likelihood_computer.ComputeTriProb(cos_triangulation_angle);
+ const float inc_prob =
+ likelihood_computer.ComputeIncProb(cos_incident_angle);
+
+ float H[9];
+- ComposeHomography(image_idx, row, col, curr_param_state.depth,
+- curr_param_state.normal, H);
++ ComposeHomography(poses_texture, image_idx, row, col,
++ curr_param_state.depth, curr_param_state.normal, H);
+ const float res_prob =
+ likelihood_computer.ComputeResolutionProb(H, row, col);
+
+@@ -1035,10 +1053,11 @@ __global__ void SweepFromTopToBottom(
+
+ costs[0] += cost_map.Get(row, col, pcc_computer.src_image_idx);
+ if (kGeomConsistencyTerm) {
+- costs[0] += options.geom_consistency_regularizer *
+- ComputeGeomConsistencyCost(
+- row, col, depths[0], pcc_computer.src_image_idx,
+- options.geom_consistency_max_cost);
++ costs[0] +=
++ options.geom_consistency_regularizer *
++ ComputeGeomConsistencyCost(
++ poses_texture, src_depth_maps_texture, row, col, depths[0],
++ pcc_computer.src_image_idx, options.geom_consistency_max_cost);
+ }
+
+ for (int i = 1; i < kNumCosts; ++i) {
+@@ -1048,7 +1067,8 @@ __global__ void SweepFromTopToBottom(
+ if (kGeomConsistencyTerm) {
+ costs[i] += options.geom_consistency_regularizer *
+ ComputeGeomConsistencyCost(
+- row, col, depths[i], pcc_computer.src_image_idx,
++ poses_texture, src_depth_maps_texture, row, col,
++ depths[i], pcc_computer.src_image_idx,
+ options.geom_consistency_max_cost);
+ }
+ }
+@@ -1102,7 +1122,7 @@ __global__ void SweepFromTopToBottom(
+ for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
+ float cos_triangulation_angle;
+ float cos_incident_angle;
+- ComputeViewingAngles(best_point, best_normal, image_idx,
++ ComputeViewingAngles(poses_texture, best_point, best_normal, image_idx,
+ &cos_triangulation_angle, &cos_incident_angle);
+ if (cos_triangulation_angle > cos_min_triangulation_angle ||
+ cos_incident_angle <= 0.0f) {
+@@ -1115,7 +1135,8 @@ __global__ void SweepFromTopToBottom(
+ num_consistent += 1;
+ }
+ } else if (!kFilterPhotoConsistency) {
+- if (ComputeGeomConsistencyCost(row, col, best_depth, image_idx,
++ if (ComputeGeomConsistencyCost(poses_texture, src_depth_maps_texture,
++ row, col, best_depth, image_idx,
+ options.geom_consistency_max_cost) <=
+ options.filter_geom_consistency_max_cost) {
+ consistency_mask.Set(row, col, image_idx, 1);
+@@ -1123,7 +1144,8 @@ __global__ void SweepFromTopToBottom(
+ }
+ } else {
+ if (sel_prob_map.Get(row, col, image_idx) >= min_ncc_prob &&
+- ComputeGeomConsistencyCost(row, col, best_depth, image_idx,
++ ComputeGeomConsistencyCost(poses_texture, src_depth_maps_texture,
++ row, col, best_depth, image_idx,
+ options.geom_consistency_max_cost) <=
+ options.filter_geom_consistency_max_cost) {
+ consistency_mask.Set(row, col, image_idx, 1);
+@@ -1169,53 +1191,49 @@ PatchMatchCuda::PatchMatchCuda(const PatchMatchOptions& options,
+ InitWorkspaceMemory();
+ }
+
+-PatchMatchCuda::~PatchMatchCuda() {
+- for (size_t i = 0; i < 4; ++i) {
+- poses_device_[i].reset();
+- }
+-}
+-
+ void PatchMatchCuda::Run() {
+ #define CASE_WINDOW_RADIUS(window_radius, window_step) \
+ case window_radius: \
+ RunWithWindowSizeAndStep<2 * window_radius + 1, window_step>(); \
+ break;
+
+-#define CASE_WINDOW_STEP(window_step) \
+- case window_step: \
+- switch (options_.window_radius) { \
+- CASE_WINDOW_RADIUS(1, window_step) \
+- CASE_WINDOW_RADIUS(2, window_step) \
+- CASE_WINDOW_RADIUS(3, window_step) \
+- CASE_WINDOW_RADIUS(4, window_step) \
+- CASE_WINDOW_RADIUS(5, window_step) \
+- CASE_WINDOW_RADIUS(6, window_step) \
+- CASE_WINDOW_RADIUS(7, window_step) \
+- CASE_WINDOW_RADIUS(8, window_step) \
+- CASE_WINDOW_RADIUS(9, window_step) \
+- CASE_WINDOW_RADIUS(10, window_step) \
+- CASE_WINDOW_RADIUS(11, window_step) \
+- CASE_WINDOW_RADIUS(12, window_step) \
+- CASE_WINDOW_RADIUS(13, window_step) \
+- CASE_WINDOW_RADIUS(14, window_step) \
+- CASE_WINDOW_RADIUS(15, window_step) \
+- CASE_WINDOW_RADIUS(16, window_step) \
+- CASE_WINDOW_RADIUS(17, window_step) \
+- CASE_WINDOW_RADIUS(18, window_step) \
+- CASE_WINDOW_RADIUS(19, window_step) \
+- CASE_WINDOW_RADIUS(20, window_step) \
+- default: { \
+- std::cerr << "Error: Window size not supported" << std::endl; \
+- break; \
+- } \
+- } \
++#define CASE_WINDOW_STEP(window_step) \
++ case window_step: \
++ switch (options_.window_radius) { \
++ CASE_WINDOW_RADIUS(1, window_step) \
++ CASE_WINDOW_RADIUS(2, window_step) \
++ CASE_WINDOW_RADIUS(3, window_step) \
++ CASE_WINDOW_RADIUS(4, window_step) \
++ CASE_WINDOW_RADIUS(5, window_step) \
++ CASE_WINDOW_RADIUS(6, window_step) \
++ CASE_WINDOW_RADIUS(7, window_step) \
++ CASE_WINDOW_RADIUS(8, window_step) \
++ CASE_WINDOW_RADIUS(9, window_step) \
++ CASE_WINDOW_RADIUS(10, window_step) \
++ CASE_WINDOW_RADIUS(11, window_step) \
++ CASE_WINDOW_RADIUS(12, window_step) \
++ CASE_WINDOW_RADIUS(13, window_step) \
++ CASE_WINDOW_RADIUS(14, window_step) \
++ CASE_WINDOW_RADIUS(15, window_step) \
++ CASE_WINDOW_RADIUS(16, window_step) \
++ CASE_WINDOW_RADIUS(17, window_step) \
++ CASE_WINDOW_RADIUS(18, window_step) \
++ CASE_WINDOW_RADIUS(19, window_step) \
++ CASE_WINDOW_RADIUS(20, window_step) \
++ default: { \
++ std::cerr << "Error: Window size " << options_.window_radius \
++ << " not supported" << std::endl; \
++ break; \
++ } \
++ } \
+ break;
+
+ switch (options_.window_step) {
+ CASE_WINDOW_STEP(1)
+ CASE_WINDOW_STEP(2)
+ default: {
+- std::cerr << "Error: Window step not supported" << std::endl;
++ std::cerr << "Error: Window step " << options_.window_step
++ << " not supported" << std::endl;
+ break;
+ }
+ }
+@@ -1274,9 +1292,10 @@ void PatchMatchCuda::RunWithWindowSizeAndStep() {
+ ComputeCudaConfig();
+ ComputeInitialCost
+ <<>>(
+- *cost_map_, *depth_map_, *normal_map_, *ref_image_->sum_image,
+- *ref_image_->squared_sum_image, options_.sigma_spatial,
+- options_.sigma_color);
++ *cost_map_, *depth_map_, *normal_map_, ref_image_texture_->GetObj(),
++ *ref_image_->sum_image, *ref_image_->squared_sum_image,
++ src_images_texture_->GetObj(), poses_texture_[0]->GetObj(),
++ options_.sigma_spatial, options_.sigma_color);
+ CUDA_SYNC_AND_CHECK();
+
+ init_timer.Print("Initialization");
+@@ -1324,8 +1343,13 @@ void PatchMatchCuda::RunWithWindowSizeAndStep() {
+ <<>>( \
+ *global_workspace_, *rand_state_map_, *cost_map_, *depth_map_, \
+ *normal_map_, *consistency_mask_, *sel_prob_map_, \
+- *prev_sel_prob_map_, *ref_image_->sum_image, \
+- *ref_image_->squared_sum_image, sweep_options);
++ *prev_sel_prob_map_, ref_image_texture_->GetObj(), \
++ *ref_image_->sum_image, *ref_image_->squared_sum_image, \
++ src_images_texture_->GetObj(), \
++ src_depth_maps_texture_ == nullptr \
++ ? 0 \
++ : src_depth_maps_texture_->GetObj(), \
++ poses_texture_[rotation_in_half_pi_]->GetObj(), sweep_options);
+
+ if (last_sweep) {
+ if (options_.filter) {
+@@ -1410,13 +1434,26 @@ void PatchMatchCuda::ComputeCudaConfig() {
+ elem_wise_grid_size_.z = 1;
+ }
+
++void PatchMatchCuda::BindRefImageTexture() {
++ cudaTextureDesc texture_desc;
++ memset(&texture_desc, 0, sizeof(texture_desc));
++ texture_desc.addressMode[0] = cudaAddressModeBorder;
++ texture_desc.addressMode[1] = cudaAddressModeBorder;
++ texture_desc.addressMode[2] = cudaAddressModeBorder;
++ texture_desc.filterMode = cudaFilterModePoint;
++ texture_desc.readMode = cudaReadModeNormalizedFloat;
++ texture_desc.normalizedCoords = false;
++ ref_image_texture_ = CudaArrayLayeredTexture::FromGpuMat(
++ texture_desc, *ref_image_->image);
++}
++
+ void PatchMatchCuda::InitRefImage() {
+ const Image& ref_image = problem_.images->at(problem_.ref_image_idx);
+
+ ref_width_ = ref_image.GetWidth();
+ ref_height_ = ref_image.GetHeight();
+
+- // Upload to device.
++ // Upload to device and filter.
+ ref_image_.reset(new GpuMatRefImage(ref_width_, ref_height_));
+ const std::vector ref_image_array =
+ ref_image.GetBitmap().ConvertToRowMajorArray();
+@@ -1424,18 +1461,7 @@ void PatchMatchCuda::InitRefImage() {
+ options_.window_step, options_.sigma_spatial,
+ options_.sigma_color);
+
+- ref_image_device_.reset(
+- new CudaArrayWrapper(ref_width_, ref_height_, 1));
+- ref_image_device_->CopyFromGpuMat(*ref_image_->image);
+-
+- // Create texture.
+- ref_image_texture.addressMode[0] = cudaAddressModeBorder;
+- ref_image_texture.addressMode[1] = cudaAddressModeBorder;
+- ref_image_texture.addressMode[2] = cudaAddressModeBorder;
+- ref_image_texture.filterMode = cudaFilterModePoint;
+- ref_image_texture.normalized = false;
+- CUDA_SAFE_CALL(
+- cudaBindTextureToArray(ref_image_texture, ref_image_device_->GetPtr()));
++ BindRefImageTexture();
+ }
+
+ void PatchMatchCuda::InitSourceImages() {
+@@ -1470,19 +1496,18 @@ void PatchMatchCuda::InitSourceImages() {
+ }
+ }
+
+- // Upload to device.
+- src_images_device_.reset(new CudaArrayWrapper(
+- max_width, max_height, problem_.src_image_idxs.size()));
+- src_images_device_->CopyToDevice(src_images_host_data.data());
+-
+ // Create source images texture.
+- src_images_texture.addressMode[0] = cudaAddressModeBorder;
+- src_images_texture.addressMode[1] = cudaAddressModeBorder;
+- src_images_texture.addressMode[2] = cudaAddressModeBorder;
+- src_images_texture.filterMode = cudaFilterModeLinear;
+- src_images_texture.normalized = false;
+- CUDA_SAFE_CALL(cudaBindTextureToArray(src_images_texture,
+- src_images_device_->GetPtr()));
++ cudaTextureDesc texture_desc;
++ memset(&texture_desc, 0, sizeof(texture_desc));
++ texture_desc.addressMode[0] = cudaAddressModeBorder;
++ texture_desc.addressMode[1] = cudaAddressModeBorder;
++ texture_desc.addressMode[2] = cudaAddressModeBorder;
++ texture_desc.filterMode = cudaFilterModeLinear;
++ texture_desc.readMode = cudaReadModeNormalizedFloat;
++ texture_desc.normalizedCoords = false;
++ src_images_texture_ = CudaArrayLayeredTexture::FromHostArray(
++ texture_desc, max_width, max_height, problem_.src_image_idxs.size(),
++ src_images_host_data.data());
+ }
+
+ // Upload source depth maps to device.
+@@ -1504,19 +1529,18 @@ void PatchMatchCuda::InitSourceImages() {
+ }
+ }
+
+- src_depth_maps_device_.reset(new CudaArrayWrapper(
+- max_width, max_height, problem_.src_image_idxs.size()));
+- src_depth_maps_device_->CopyToDevice(src_depth_maps_host_data.data());
+-
+ // Create source depth maps texture.
+- src_depth_maps_texture.addressMode[0] = cudaAddressModeBorder;
+- src_depth_maps_texture.addressMode[1] = cudaAddressModeBorder;
+- src_depth_maps_texture.addressMode[2] = cudaAddressModeBorder;
+- // TODO: Check if linear interpolation improves results or not.
+- src_depth_maps_texture.filterMode = cudaFilterModePoint;
+- src_depth_maps_texture.normalized = false;
+- CUDA_SAFE_CALL(cudaBindTextureToArray(src_depth_maps_texture,
+- src_depth_maps_device_->GetPtr()));
++ cudaTextureDesc texture_desc;
++ memset(&texture_desc, 0, sizeof(texture_desc));
++ texture_desc.addressMode[0] = cudaAddressModeBorder;
++ texture_desc.addressMode[1] = cudaAddressModeBorder;
++ texture_desc.addressMode[2] = cudaAddressModeBorder;
++ texture_desc.filterMode = cudaFilterModePoint;
++ texture_desc.readMode = cudaReadModeElementType;
++ texture_desc.normalizedCoords = false;
++ src_depth_maps_texture_ = CudaArrayLayeredTexture::FromHostArray(
++ texture_desc, max_width, max_height, problem_.src_image_idxs.size(),
++ src_depth_maps_host_data.data());
+ }
+ }
+
+@@ -1576,6 +1600,15 @@ void PatchMatchCuda::InitTransforms() {
+ // Matrix for 90deg rotation around Z-axis in counter-clockwise direction.
+ const float R_z90[9] = {0, 1, 0, -1, 0, 0, 0, 0, 1};
+
++ cudaTextureDesc texture_desc;
++ memset(&texture_desc, 0, sizeof(texture_desc));
++ texture_desc.addressMode[0] = cudaAddressModeBorder;
++ texture_desc.addressMode[1] = cudaAddressModeBorder;
++ texture_desc.addressMode[2] = cudaAddressModeBorder;
++ texture_desc.filterMode = cudaFilterModePoint;
++ texture_desc.readMode = cudaReadModeElementType;
++ texture_desc.normalizedCoords = false;
++
+ for (size_t i = 0; i < 4; ++i) {
+ const size_t kNumTformParams = 4 + 9 + 3 + 3 + 12 + 12;
+ std::vector poses_host_data(kNumTformParams *
+@@ -1614,20 +1647,12 @@ void PatchMatchCuda::InitTransforms() {
+ offset += 12;
+ }
+
+- poses_device_[i].reset(new CudaArrayWrapper(
+- kNumTformParams, problem_.src_image_idxs.size(), 1));
+- poses_device_[i]->CopyToDevice(poses_host_data.data());
++ poses_texture_[i] = CudaArrayLayeredTexture::FromHostArray(
++ texture_desc, kNumTformParams, problem_.src_image_idxs.size(), 1,
++ poses_host_data.data());
+
+ RotatePose(R_z90, rotated_R, rotated_T);
+ }
+-
+- poses_texture.addressMode[0] = cudaAddressModeBorder;
+- poses_texture.addressMode[1] = cudaAddressModeBorder;
+- poses_texture.addressMode[2] = cudaAddressModeBorder;
+- poses_texture.filterMode = cudaFilterModePoint;
+- poses_texture.normalized = false;
+- CUDA_SAFE_CALL(
+- cudaBindTextureToArray(poses_texture, poses_device_[0]->GetPtr()));
+ }
+
+ void PatchMatchCuda::InitWorkspaceMemory() {
+@@ -1727,15 +1752,9 @@ void PatchMatchCuda::Rotate() {
+ ref_image_->squared_sum_image->Rotate(
+ rotated_ref_image->squared_sum_image.get());
+ ref_image_.swap(rotated_ref_image);
++ BindRefImageTexture();
+ }
+
+- // Bind rotated reference image to texture.
+- ref_image_device_.reset(new CudaArrayWrapper(width, height, 1));
+- ref_image_device_->CopyFromGpuMat(*ref_image_->image);
+- CUDA_SAFE_CALL(cudaUnbindTexture(ref_image_texture));
+- CUDA_SAFE_CALL(
+- cudaBindTextureToArray(ref_image_texture, ref_image_device_->GetPtr()));
+-
+ // Rotate selection probability map.
+ prev_sel_prob_map_.reset(
+ new GpuMat(width, height, problem_.src_image_idxs.size()));
+@@ -1751,11 +1770,6 @@ void PatchMatchCuda::Rotate() {
+ cost_map_.swap(rotated_cost_map);
+ }
+
+- // Rotate transformations.
+- CUDA_SAFE_CALL(cudaUnbindTexture(poses_texture));
+- CUDA_SAFE_CALL(cudaBindTextureToArray(
+- poses_texture, poses_device_[rotation_in_half_pi_]->GetPtr()));
+-
+ // Rotate calibration.
+ CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_K, ref_K_host_[rotation_in_half_pi_],
+ sizeof(float) * 4, 0,
+diff --git a/src/mvs/patch_match_cuda.h b/src/mvs/patch_match_cuda.h
+index adbecdbd9..9e85e9b32 100644
+--- a/src/mvs/patch_match_cuda.h
++++ b/src/mvs/patch_match_cuda.h
+@@ -38,7 +38,7 @@
+
+ #include
+
+-#include "mvs/cuda_array_wrapper.h"
++#include "mvs/cuda_texture.h"
+ #include "mvs/depth_map.h"
+ #include "mvs/gpu_mat.h"
+ #include "mvs/gpu_mat_prng.h"
+@@ -54,7 +54,6 @@ class PatchMatchCuda {
+ public:
+ PatchMatchCuda(const PatchMatchOptions& options,
+ const PatchMatch::Problem& problem);
+- ~PatchMatchCuda();
+
+ void Run();
+
+@@ -69,6 +68,8 @@ class PatchMatchCuda {
+
+ void ComputeCudaConfig();
+
++ void BindRefImageTexture();
++
+ void InitRefImage();
+ void InitSourceImages();
+ void InitTransforms();
+@@ -96,9 +97,9 @@ class PatchMatchCuda {
+ int rotation_in_half_pi_;
+
+ // Reference and source image input data.
+- std::unique_ptr> ref_image_device_;
+- std::unique_ptr> src_images_device_;
+- std::unique_ptr> src_depth_maps_device_;
++ std::unique_ptr> ref_image_texture_;
++ std::unique_ptr> src_images_texture_;
++ std::unique_ptr> src_depth_maps_texture_;
+
+ // Relative poses from rotated versions of reference image to source images
+ // corresponding to _rotationInHalfPi:
+@@ -114,7 +115,7 @@ class PatchMatchCuda {
+ // R, T, C, P, P^-1 denote the relative rotation, translation, camera
+ // center, projection, and inverse projection from there reference to the
+ // i-th source image.
+- std::unique_ptr> poses_device_[4];
++ std::unique_ptr> poses_texture_[4];
+
+ // Calibration matrix for rotated versions of reference image
+ // as {K[0, 0], K[0, 2], K[1, 1], K[1, 2]} corresponding to _rotationInHalfPi.
diff --git a/recipe/1823.patch b/recipe/1823.patch
new file mode 100644
index 0000000..2c31a89
--- /dev/null
+++ b/recipe/1823.patch
@@ -0,0 +1,39 @@
+From 821a85ba0d96ce5f53a15d8a538de268b891b3d0 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Johannes=20Sch=C3=B6nberger?=
+Date: Wed, 1 Mar 2023 22:10:07 +0100
+Subject: [PATCH] Remove unused SIFT GPU cuda texture reference
+
+---
+ lib/SiftGPU/ProgramCU.cu | 6 ------
+ 1 file changed, 6 deletions(-)
+
+diff --git a/lib/SiftGPU/ProgramCU.cu b/lib/SiftGPU/ProgramCU.cu
+index 51c781341..9d842770d 100644
+--- a/lib/SiftGPU/ProgramCU.cu
++++ b/lib/SiftGPU/ProgramCU.cu
+@@ -1683,9 +1683,6 @@ void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2,
+ MatH, hdistmax, MatF, fdistmax);
+ }
+
+-
+-texture texDOT;
+-
+ #define ROWMATCH_BLOCK_WIDTH 32
+ #define ROWMATCH_BLOCK_HEIGHT 1
+
+@@ -1755,15 +1752,12 @@ void ProgramCU::GetRowMatch(CuTexImage* texDot, CuTexImage* texMatch, float dist
+ int num2 = texDot->GetImgWidth();
+ dim3 grid(1, num1/ROWMATCH_BLOCK_HEIGHT);
+ dim3 block(ROWMATCH_BLOCK_WIDTH, ROWMATCH_BLOCK_HEIGHT);
+- // texDot->BindTexture(texDOT);
+ RowMatch_Kernel<<>>((int*)texDot->_cuData,
+ (int*)texMatch->_cuData, num2, distmax, ratiomax);
+ }
+
+ #define COLMATCH_BLOCK_WIDTH 32
+
+-//texture texCT;
+-
+ void __global__ ColMatch_Kernel(int3*d_crt, int* d_result, int height, int num2, float distmax, float ratiomax)
+ {
+ int col = COLMATCH_BLOCK_WIDTH * blockIdx.x + threadIdx.x;
diff --git a/recipe/1838.patch b/recipe/1838.patch
new file mode 100644
index 0000000..84ae322
--- /dev/null
+++ b/recipe/1838.patch
@@ -0,0 +1,1148 @@
+From d361730f19a675e1f60b3fe45333441deb0e3d1d Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Johannes=20Sch=C3=B6nberger?=
+Date: Sat, 11 Mar 2023 22:28:18 +0100
+Subject: [PATCH 01/14] [WIP] Upgrade SiftGPU to use CUDA texture objects
+
+---
+ lib/SiftGPU/CuTexImage.cpp | 54 +++++++++++++++++++-
+ lib/SiftGPU/CuTexImage.h | 12 ++++-
+ lib/SiftGPU/ProgramCU.cu | 101 ++++++++++++++++++++-----------------
+ 3 files changed, 118 insertions(+), 49 deletions(-)
+
+diff --git a/lib/SiftGPU/CuTexImage.cpp b/lib/SiftGPU/CuTexImage.cpp
+index a4ef59bba..be0383d2b 100644
+--- a/lib/SiftGPU/CuTexImage.cpp
++++ b/lib/SiftGPU/CuTexImage.cpp
+@@ -27,6 +27,7 @@
+ #include
+ #include
+ #include
++#include
+ using namespace std;
+
+
+@@ -39,10 +40,48 @@ using namespace std;
+ #include "CuTexImage.h"
+ #include "ProgramCU.h"
+
+-#if CUDA_VERSION <= 2010 && defined(SIFTGPU_ENABLE_LINEAR_TEX2D)
+-#error "Require CUDA 2.2 or higher"
+-#endif
++CuTexImage::CuTexObj::~CuTexObj()
++{
++ cudaDestroyTextureObject(handle);
++}
++
++CuTexImage::CuTexObj CuTexImage::BindTexture(const cudaTextureDesc& textureDesc,
++ const cudaChannelFormatDesc& channelFmtDesc)
++{
++ CuTexObj texObj;
+
++ cudaResourceDesc resourceDesc;
++ memset(&resourceDesc, 0, sizeof(resourceDesc));
++ resourceDesc.resType = cudaResourceTypeLinear;
++ resourceDesc.res.linear.devPtr = _cuData;
++ resourceDesc.res.linear.desc = channelFmtDesc;
++ resourceDesc.res.linear.sizeInBytes = _numBytes;
++
++ cudaCreateTextureObject(&texObj.handle, &resourceDesc, &textureDesc, nullptr);
++ ProgramCU::CheckErrorCUDA("CuTexImage::BindTexture");
++
++ return texObj;
++}
++
++CuTexImage::CuTexObj CuTexImage::BindTexture2D(const cudaTextureDesc& textureDesc,
++ const cudaChannelFormatDesc& channelFmtDesc)
++{
++ CuTexObj texObj;
++
++ cudaResourceDesc resourceDesc;
++ memset(&resourceDesc, 0, sizeof(resourceDesc));
++ resourceDesc.resType = cudaResourceTypePitch2D;
++ resourceDesc.res.pitch2D.devPtr = _cuData;
++ resourceDesc.res.pitch2D.width = _imgWidth;
++ resourceDesc.res.pitch2D.height = _imgHeight;
++ resourceDesc.res.pitch2D.pitchInBytes = _imgWidth * _numChannel * sizeof(float);
++ resourceDesc.res.pitch2D.desc = channelFmtDesc;
++
++ cudaCreateTextureObject(&texObj.handle, &resourceDesc, &textureDesc, nullptr);
++ ProgramCU::CheckErrorCUDA("CuTexImage::BindTexture2D");
++
++ return texObj;
++}
+
+ CuTexImage::CuTexImage()
+ {
+@@ -171,69 +210,6 @@ void CuTexImage::CopyToHost(void * buf, int stream)
+ cudaMemcpyAsync(buf, _cuData, _imgWidth * _imgHeight * _numChannel * sizeof(float), cudaMemcpyDeviceToHost, (cudaStream_t)stream);
+ }
+
+-void CuTexImage::InitTexture2D()
+-{
+-#if !defined(SIFTGPU_ENABLE_LINEAR_TEX2D)
+- if(_cuData2D && (_texWidth < _imgWidth || _texHeight < _imgHeight))
+- {
+- cudaFreeArray(_cuData2D);
+- _cuData2D = NULL;
+- }
+-
+- if(_cuData2D == NULL)
+- {
+- _texWidth = max(_texWidth, _imgWidth);
+- _texHeight = max(_texHeight, _imgHeight);
+- cudaChannelFormatDesc desc;
+- desc.f = cudaChannelFormatKindFloat;
+- desc.x = sizeof(float) * 8;
+- desc.y = _numChannel >=2 ? sizeof(float) * 8 : 0;
+- desc.z = _numChannel >=3 ? sizeof(float) * 8 : 0;
+- desc.w = _numChannel >=4 ? sizeof(float) * 8 : 0;
+- const cudaError_t status = cudaMallocArray(&_cuData2D, &desc, _texWidth, _texHeight);
+-
+- if (status != cudaSuccess) {
+- _cuData = NULL;
+- _numBytes = 0;
+- }
+-
+- ProgramCU::CheckErrorCUDA("CuTexImage::InitTexture2D");
+- }
+-#endif
+-}
+-
+-void CuTexImage::CopyToTexture2D()
+-{
+-#if !defined(SIFTGPU_ENABLE_LINEAR_TEX2D)
+- InitTexture2D();
+-
+- if(_cuData2D)
+- {
+- cudaMemcpy2DToArray(_cuData2D, 0, 0, _cuData, _imgWidth* _numChannel* sizeof(float) ,
+- _imgWidth * _numChannel*sizeof(float), _imgHeight, cudaMemcpyDeviceToDevice);
+- ProgramCU::CheckErrorCUDA("cudaMemcpy2DToArray");
+- }
+-#endif
+-
+-}
+-
+-int CuTexImage::DebugCopyToTexture2D()
+-{
+-
+-/* CuTexImage tex;
+- float data1[2][3] = {{1, 2, 5}, {3, 4, 5}}, data2[2][5];
+- tex.InitTexture(3, 2, 1);
+- cudaMemcpy(tex._cuData, data1[0], 6 * sizeof(float), cudaMemcpyHostToDevice);
+- cudaMemcpy(data1, tex._cuData, 4 * sizeof(float) , cudaMemcpyDeviceToHost);
+- tex._texWidth =5; tex._texHeight = 2;
+- tex.CopyToTexture2D();
+- cudaMemcpyFromArray(data2[0], tex._cuData2D, 0, 0, 10 * sizeof(float), cudaMemcpyDeviceToHost);*/
+-
+- return 1;
+-}
+-
+-
+-
+ void CuTexImage::CopyFromPBO(int width, int height, GLuint pbo)
+ {
+ void* pbuf =NULL;
+diff --git a/lib/SiftGPU/CuTexImage.h b/lib/SiftGPU/CuTexImage.h
+index 0d62f6d07..1303b24cc 100644
+--- a/lib/SiftGPU/CuTexImage.h
++++ b/lib/SiftGPU/CuTexImage.h
+@@ -24,13 +24,9 @@
+ #ifndef CU_TEX_IMAGE_H
+ #define CU_TEX_IMAGE_H
+
+-class GLTexImage;
+-struct cudaArray;
+-struct textureReference;
+-
+-//using texture2D from linear memory
++#include
+
+-#define SIFTGPU_ENABLE_LINEAR_TEX2D
++class GLTexImage;
+
+ class CuTexImage
+ {
+@@ -45,18 +41,23 @@ class CuTexImage
+ int _texHeight;
+ GLuint _fromPBO;
+ public:
++ struct CuTexObj
++ {
++ cudaTextureObject_t handle;
++ ~CuTexObj();
++ };
++
+ virtual void SetImageSize(int width, int height);
+ virtual bool InitTexture(int width, int height, int nchannel = 1);
+- void InitTexture2D();
+- inline void BindTexture(textureReference& texRef);
+- inline void BindTexture2D(textureReference& texRef);
+- void CopyToTexture2D();
++ CuTexObj BindTexture(const cudaTextureDesc& textureDesc,
++ const cudaChannelFormatDesc& channelFmtDesc);
++ CuTexObj BindTexture2D(const cudaTextureDesc& textureDesc,
++ const cudaChannelFormatDesc& channelFmtDesc);
+ void CopyToHost(void* buf);
+ void CopyToHost(void* buf, int stream);
+ void CopyFromHost(const void* buf);
+ int CopyToPBO(GLuint pbo);
+ void CopyFromPBO(int width, int height, GLuint pbo);
+- static int DebugCopyToTexture2D();
+ public:
+ inline int GetImgWidth(){return _imgWidth;}
+ inline int GetImgHeight(){return _imgHeight;}
+diff --git a/lib/SiftGPU/ProgramCU.cu b/lib/SiftGPU/ProgramCU.cu
+index 9d842770d..0b99ad066 100644
+--- a/lib/SiftGPU/ProgramCU.cu
++++ b/lib/SiftGPU/ProgramCU.cu
+@@ -98,19 +98,33 @@
+
+
+ __device__ __constant__ float d_kernel[KERNEL_MAX_WIDTH];
+-texture texData;
+-texture texDataB;
+-texture texDataF2;
+-texture texDataF4;
+-texture texDataI4;
+-texture texDataList;
+-
+-//template __device__ float Conv(float *data) { return Conv(data) + data[i]*d_kernel[i];}
+-//template<> __device__ float Conv<0>(float *data) { return data[0] * d_kernel[0]; }
+
++const static cudaTextureDesc texDataDesc = []() {
++ cudaTextureDesc textureDesc;
++ memset(&textureDesc, 0, sizeof(textureDesc));
++ textureDesc.readMode = cudaReadModeElementType;
++ textureDesc.addressMode[0] = cudaAddressModeClamp;
++ textureDesc.addressMode[1] = cudaAddressModeClamp;
++ textureDesc.addressMode[2] = cudaAddressModeClamp;
++ textureDesc.filterMode = cudaFilterModePoint;
++ textureDesc.normalizedCoords = false;
++ return textureDesc;
++}();
++
++const static cudaTextureDesc texDataBDesc = []() {
++ cudaTextureDesc textureDesc;
++ memset(&textureDesc, 0, sizeof(textureDesc));
++ textureDesc.readMode = cudaReadModeNormalizedFloat;
++ textureDesc.addressMode[0] = cudaAddressModeClamp;
++ textureDesc.addressMode[1] = cudaAddressModeClamp;
++ textureDesc.addressMode[2] = cudaAddressModeClamp;
++ textureDesc.filterMode = cudaFilterModePoint;
++ textureDesc.normalizedCoords = false;
++ return textureDesc;
++}();
+
+ //////////////////////////////////////////////////////////////
+-template __global__ void FilterH( float* d_result, int width)
++template __global__ void FilterH(cudaTextureObject_t texData, float* d_result, int width)
+ {
+
+ const int HALF_WIDTH = FW >> 1;
+@@ -130,7 +144,7 @@ template __global__ void FilterH( float* d_result, int width)
+ if(cache_index < CACHE_WIDTH)
+ {
+ int fetch_index = src_index < index_min? index_min : (src_index > index_max ? index_max : src_index);
+- data[cache_index] = tex1Dfetch(texData,fetch_index);
++ data[cache_index] = tex1Dfetch(texData,fetch_index);
+ src_index += FILTERH_TILE_WIDTH;
+ cache_index += FILTERH_TILE_WIDTH;
+ }
+@@ -149,7 +163,7 @@ template __global__ void FilterH( float* d_result, int width)
+
+
+ ////////////////////////////////////////////////////////////////////
+-template __global__ void FilterV(float* d_result, int width, int height)
++template __global__ void FilterV(cudaTextureObject_t texData, float* d_result, int width, int height)
+ {
+ const int HALF_WIDTH = FW >> 1;
+ const int CACHE_WIDTH = FW + FILTERV_TILE_HEIGHT - 1;
+@@ -188,7 +202,7 @@ template __global__ void FilterV(float* d_result, int width, int heigh
+ if(cache_col_start < CACHE_WIDTH - i * FILTERV_BLOCK_HEIGHT)
+ {
+ int fetch_index = data_index < col ? col : (data_index > data_index_max? data_index_max : data_index);
+- data[cache_index + i * FILTERV_BLOCK_HEIGHT] = tex1Dfetch(texData,fetch_index);
++ data[cache_index + i * FILTERV_BLOCK_HEIGHT] = tex1Dfetch(texData,fetch_index);
+ data_index += IMUL(FILTERV_BLOCK_HEIGHT, width);
+ }
+ }
+@@ -218,7 +232,7 @@ template __global__ void FilterV(float* d_result, int width, int heigh
+ }
+
+
+-template __global__ void UpsampleKernel(float* d_result, int width)
++template __global__ void UpsampleKernel(cudaTextureObject_t texData, float* d_result, int width)
+ {
+ const int SCALE = (1 << LOG_SCALE), SCALE_MASK = (SCALE - 1);
+ const float INV_SCALE = 1.0f / (float(SCALE));
+@@ -232,11 +246,11 @@ template __global__ void UpsampleKernel(float* d_result, int widt
+ int helper = blockIdx.y & SCALE_MASK;
+ if (helper)
+ {
+- float v11 = tex1Dfetch(texData, index);
+- float v12 = tex1Dfetch(texData, index + 1);
++ float v11 = tex1Dfetch(texData, index);
++ float v12 = tex1Dfetch(texData, index + 1);
+ index += width;
+- float v21 = tex1Dfetch(texData, index);
+- float v22 = tex1Dfetch(texData, index + 1);
++ float v21 = tex1Dfetch(texData, index);
++ float v22 = tex1Dfetch(texData, index + 1);
+ float w1 = INV_SCALE * helper, w2 = 1.0 - w1;
+ float v1 = (v21 * w1 + w2 * v11);
+ float v2 = (v22 * w1 + w2 * v12);
+@@ -250,8 +264,8 @@ template __global__ void UpsampleKernel(float* d_result, int widt
+ }
+ }else
+ {
+- float v1 = tex1Dfetch(texData, index);
+- float v2 = tex1Dfetch(texData, index + 1);
++ float v1 = tex1Dfetch(texData, index);
++ float v2 = tex1Dfetch(texData, index + 1);
+ d_result[dst_idx] = v1;
+ #pragma unroll
+ for(int i = 1; i < SCALE; ++i)
+@@ -268,19 +282,19 @@ template __global__ void UpsampleKernel(float* d_result, int widt
+ void ProgramCU::SampleImageU(CuTexImage *dst, CuTexImage *src, int log_scale)
+ {
+ int width = src->GetImgWidth(), height = src->GetImgHeight();
+- src->BindTexture(texData);
++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc());
+ dim3 grid((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height << log_scale);
+ dim3 block(FILTERH_TILE_WIDTH);
+ switch(log_scale)
+ {
+- case 1 : UpsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, width); break;
+- case 2 : UpsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, width); break;
+- case 3 : UpsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, width); break;
++ case 1 : UpsampleKernel<1> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break;
++ case 2 : UpsampleKernel<2> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break;
++ case 3 : UpsampleKernel<3> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, width); break;
+ default: break;
+ }
+ }
+
+-template __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width)
++template __global__ void DownsampleKernel(cudaTextureObject_t texData, float* d_result, int src_width, int dst_width)
+ {
+ const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
+ if(dst_col >= dst_width) return;
+@@ -289,11 +303,11 @@ template __global__ void DownsampleKernel(float* d_result, int sr
+ const int src_row = blockIdx.y << LOG_SCALE;
+ const int src_idx = IMUL(src_row, src_width) + src_col;
+ const int dst_idx = IMUL(dst_width, dst_row) + dst_col;
+- d_result[dst_idx] = tex1Dfetch(texData, src_idx);
++ d_result[dst_idx] = tex1Dfetch(texData, src_idx);
+
+ }
+
+-__global__ void DownsampleKernel(float* d_result, int src_width, int dst_width, const int log_scale)
++__global__ void DownsampleKernel(cudaTextureObject_t texData, float* d_result, int src_width, int dst_width, const int log_scale)
+ {
+ const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
+ if(dst_col >= dst_width) return;
+@@ -302,7 +316,7 @@ __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width,
+ const int src_row = blockIdx.y << log_scale;
+ const int src_idx = IMUL(src_row, src_width) + src_col;
+ const int dst_idx = IMUL(dst_width, dst_row) + dst_col;
+- d_result[dst_idx] = tex1Dfetch(texData, src_idx);
++ d_result[dst_idx] = tex1Dfetch(texData, src_idx);
+
+ }
+
+@@ -310,28 +324,28 @@ void ProgramCU::SampleImageD(CuTexImage *dst, CuTexImage *src, int log_scale)
+ {
+ int src_width = src->GetImgWidth(), dst_width = dst->GetImgWidth() ;
+
+- src->BindTexture(texData);
++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc());
+ dim3 grid((dst_width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, dst->GetImgHeight());
+ dim3 block(FILTERH_TILE_WIDTH);
+ switch(log_scale)
+ {
+- case 1 : DownsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break;
+- case 2 : DownsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break;
+- case 3 : DownsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break;
+- default: DownsampleKernel <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width, log_scale);
++ case 1 : DownsampleKernel<1> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break;
++ case 2 : DownsampleKernel<2> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break;
++ case 3 : DownsampleKernel<3> <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width); break;
++ default: DownsampleKernel <<< grid, block>>> (srcTex.handle, (float*) dst->_cuData, src_width, dst_width, log_scale);
+ }
+ }
+
+-__global__ void ChannelReduce_Kernel(float* d_result)
++__global__ void ChannelReduce_Kernel(cudaTextureObject_t texData, float* d_result)
+ {
+ int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
+- d_result[index] = tex1Dfetch(texData, index*4);
++ d_result[index] = tex1Dfetch(texData, index*4);
+ }
+
+-__global__ void ChannelReduce_Convert_Kernel(float* d_result)
++__global__ void ChannelReduce_Convert_Kernel(cudaTextureObject_t texDataF4, float* d_result)
+ {
+ int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
+- float4 rgba = tex1Dfetch(texDataF4, index);
++ float4 rgba = tex1Dfetch(texDataF4, index);
+ d_result[index] = 0.299f * rgba.x + 0.587f* rgba.y + 0.114f * rgba.z;
+ }
+
+@@ -343,19 +357,19 @@ void ProgramCU::ReduceToSingleChannel(CuTexImage* dst, CuTexImage* src, int conv
+ dim3 block(FILTERH_TILE_WIDTH);
+ if(convert_rgb)
+ {
+- src->BindTexture(texDataF4);
+- ChannelReduce_Convert_Kernel<<>>((float*)dst->_cuData);
++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc());
++ ChannelReduce_Convert_Kernel<<>>(srcTex.handle, (float*)dst->_cuData);
+ }else
+ {
+- src->BindTexture(texData);
+- ChannelReduce_Kernel<<<grid, block>>>((float*)dst->_cuData);
++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
++ ChannelReduce_Kernel<<<grid, block>>>(srcTex.handle, (float*)dst->_cuData);
+ }
+ }
+
+-__global__ void ConvertByteToFloat_Kernel(float* d_result)
++__global__ void ConvertByteToFloat_Kernel(cudaTextureObject_t texDataB, float* d_result)
+ {
+ int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
+- d_result[index] = tex1Dfetch(texDataB, index);
++ d_result[index] = tex1Dfetch<float>(texDataB, index);
+ }
+
+ void ProgramCU::ConvertByteToFloat(CuTexImage*src, CuTexImage* dst)
+@@ -363,8 +377,8 @@ void ProgramCU::ConvertByteToFloat(CuTexImage*src, CuTexImage* dst)
+ int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
+ dim3 grid((width * height + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
+ dim3 block(FILTERH_TILE_WIDTH);
+- src->BindTexture(texDataB);
+- ConvertByteToFloat_Kernel<<<grid, block>>>((float*)dst->_cuData);
++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataBDesc, cudaCreateChannelDesc<unsigned char>());
++ ConvertByteToFloat_Kernel<<<grid, block>>>(srcTex.handle, (float*)dst->_cuData);
+ }
+
+ void ProgramCU::CreateFilterKernel(float sigma, float* kernel, int& width)
+@@ -403,17 +417,17 @@ template<int FW> void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, C
+ int width = src->GetImgWidth(), height = src->GetImgHeight();
+
+ //horizontal filtering
+- src->BindTexture(texData);
++ CuTexImage::CuTexObj srcTex = src->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
+ dim3 gridh((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height);
+ dim3 blockh(FILTERH_TILE_WIDTH);
+- FilterH<FW><<<gridh, blockh>>>((float*)buf->_cuData, width);
++ FilterH<FW><<<gridh, blockh>>>(srcTex.handle, (float*)buf->_cuData, width);
+ CheckErrorCUDA("FilterH");
+
+ ///vertical filtering
+- buf->BindTexture(texData);
++ CuTexImage::CuTexObj bufTex = buf->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
+ dim3 gridv((width + FILTERV_TILE_WIDTH - 1)/ FILTERV_TILE_WIDTH, (height + FILTERV_TILE_HEIGHT - 1)/FILTERV_TILE_HEIGHT);
+ dim3 blockv(FILTERV_TILE_WIDTH, FILTERV_BLOCK_HEIGHT);
+- FilterV<FW><<<gridv, blockv>>>((float*)dst->_cuData, width, height);
++ FilterV<FW><<<gridv, blockv>>>(bufTex.handle, (float*)dst->_cuData, width, height);
+ CheckErrorCUDA("FilterV");
+ }
+
+@@ -450,24 +464,20 @@ void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf, f
+ }
+
+
+-texture<float, 1, cudaReadModeElementType> texC;
+-texture<float, 1, cudaReadModeElementType> texP;
+-texture<float, 1, cudaReadModeElementType> texN;
+-
+-void __global__ ComputeDOG_Kernel(float* d_dog, float2* d_got, int width, int height)
++void __global__ ComputeDOG_Kernel(cudaTextureObject_t texC, cudaTextureObject_t texP, float* d_dog, float2* d_got, int width, int height)
+ {
+ int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
+ int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
+ if(col < width && row < height)
+ {
+ int index = IMUL(row, width) + col;
+- float vp = tex1Dfetch(texP, index);
+- float v = tex1Dfetch(texC, index);
++ float vp = tex1Dfetch<float>(texP, index);
++ float v = tex1Dfetch<float>(texC, index);
+ d_dog[index] = v - vp;
+- float vxn = tex1Dfetch(texC, index + 1);
+- float vxp = tex1Dfetch(texC, index - 1);
+- float vyp = tex1Dfetch(texC, index - width);
+- float vyn = tex1Dfetch(texC, index + width);
++ float vxn = tex1Dfetch<float>(texC, index + 1);
++ float vxp = tex1Dfetch<float>(texC, index - 1);
++ float vyp = tex1Dfetch<float>(texC, index - width);
++ float vyn = tex1Dfetch<float>(texC, index + width);
+ float dx = vxn - vxp, dy = vyn - vyp;
+ float grd = 0.5f * sqrt(dx * dx + dy * dy);
+ float rot = (grd == 0.0f? 0.0f : atan2(dy, dx));
+@@ -475,15 +485,15 @@ void __global__ ComputeDOG_Kernel(float* d_dog, float2* d_got, int width, int he
+ }
+ }
+
+-void __global__ ComputeDOG_Kernel(float* d_dog, int width, int height)
++void __global__ ComputeDOG_Kernel(cudaTextureObject_t texC, cudaTextureObject_t texP, float* d_dog, int width, int height)
+ {
+ int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
+ int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
+ if(col < width && row < height)
+ {
+ int index = IMUL(row, width) + col;
+- float vp = tex1Dfetch(texP, index);
+- float v = tex1Dfetch(texC, index);
++ float vp = tex1Dfetch<float>(texP, index);
++ float v = tex1Dfetch<float>(texC, index);
+ d_dog[index] = v - vp;
+ }
+ }
+@@ -493,19 +503,19 @@ void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got)
+ int width = gus->GetImgWidth(), height = gus->GetImgHeight();
+ dim3 grid((width + DOG_BLOCK_DIMX - 1)/ DOG_BLOCK_DIMX, (height + DOG_BLOCK_DIMY - 1)/DOG_BLOCK_DIMY);
+ dim3 block(DOG_BLOCK_DIMX, DOG_BLOCK_DIMY);
+- gus->BindTexture(texC);
+- (gus -1)->BindTexture(texP);
++ CuTexImage::CuTexObj texCObj = gus->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
++ CuTexImage::CuTexObj texPObj = (gus-1)->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
+ if(got->_cuData)
+- ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, (float2*) got->_cuData, width, height);
++ ComputeDOG_Kernel<<<grid, block>>>(texCObj.handle, texPObj.handle, (float*) dog->_cuData, (float2*) got->_cuData, width, height);
+ else
+- ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, width, height);
++ ComputeDOG_Kernel<<<grid, block>>>(texCObj.handle, texPObj.handle, (float*) dog->_cuData, width, height);
+ }
+
+
+ #define READ_CMP_DOG_DATA(datai, tex, idx) \
+- datai[0] = tex1Dfetch(tex, idx - 1);\
+- datai[1] = tex1Dfetch(tex, idx);\
+- datai[2] = tex1Dfetch(tex, idx + 1);\
++ datai[0] = tex1Dfetch<float>(tex, idx - 1);\
++ datai[1] = tex1Dfetch<float>(tex, idx);\
++ datai[2] = tex1Dfetch<float>(tex, idx + 1);\
+ if(v > nmax)\
+ {\
+ nmax = max(nmax, datai[0]);\
+@@ -521,7 +531,7 @@ void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got)
+ }
+
+
+-void __global__ ComputeKEY_Kernel(float4* d_key, int width, int colmax, int rowmax,
++void __global__ ComputeKEY_Kernel(cudaTextureObject_t texP, cudaTextureObject_t texC, cudaTextureObject_t texN, float4* d_key, int width, int colmax, int rowmax,
+ float dog_threshold0, float dog_threshold, float edge_threshold, int subpixel_localization)
+ {
+ float data[3][3], v;
+@@ -546,11 +556,11 @@ void __global__ ComputeKEY_Kernel(float4* d_key, int width, int colmax, int rowm
+ #endif
+ {
+ in_image = 1;
+- data[1][1] = v = tex1Dfetch(texC, idx[1]);
++ data[1][1] = v = tex1Dfetch<float>(texC, idx[1]);
+ if(fabs(v) <= dog_threshold0) goto key_finish;
+
+- data[1][0] = tex1Dfetch(texC, idx[1] - 1);
+- data[1][2] = tex1Dfetch(texC, idx[1] + 1);
++ data[1][0] = tex1Dfetch<float>(texC, idx[1] - 1);
++ data[1][2] = tex1Dfetch<float>(texC, idx[1] + 1);
+ nmax = max(data[1][0], data[1][2]);
+ nmin = min(data[1][0], data[1][2]);
+
+@@ -651,18 +661,18 @@ void ProgramCU::ComputeKEY(CuTexImage* dog, CuTexImage* key, float Tdog, float T
+ dim3 grid((width + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX, (height + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY);
+ #endif
+ dim3 block(KEY_BLOCK_DIMX, KEY_BLOCK_DIMY);
+- dogp->BindTexture(texP);
+- dog ->BindTexture(texC);
+- dogn->BindTexture(texN);
++ CuTexImage::CuTexObj texPObj = dogp->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
++ CuTexImage::CuTexObj texCObj = dog->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
++ CuTexImage::CuTexObj texNObj = dogn->BindTexture(texDataDesc, cudaCreateChannelDesc<float>());
+ Tedge = (Tedge+1)*(Tedge+1)/Tedge;
+- ComputeKEY_Kernel<<<grid, block>>>((float4*) key->_cuData, width,
++ ComputeKEY_Kernel<<<grid, block>>>(texPObj.handle, texCObj.handle, texNObj.handle, (float4*) key->_cuData, width,
+ width -1, height -1, Tdog1, Tdog, Tedge, GlobalUtil::_SubpixelLocalization);
+
+ }
+
+
+
+-void __global__ InitHist_Kernel(int4* hist, int ws, int wd, int height)
++void __global__ InitHist_Kernel(cudaTextureObject_t texDataF4, int4* hist, int ws, int wd, int height)
+ {
+ int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
+ int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
+@@ -677,7 +687,7 @@ void __global__ InitHist_Kernel(int4* hist, int ws, int wd, int height)
+ #pragma unroll
+ for(int i = 0; i < 4 ; ++i, ++scol)
+ {
+- float4 temp = tex1Dfetch(texDataF4, sidx +i);
++ float4 temp = tex1Dfetch<float4>(texDataF4, sidx +i);
+ v[i] = (scol < ws -1 && scol > 0 && temp.x!=0) ? 1 : 0;
+ }
+ }
+@@ -694,13 +704,13 @@ void ProgramCU::InitHistogram(CuTexImage* key, CuTexImage* hist)
+ int wd = hist->GetImgWidth(), hd = hist->GetImgHeight();
+ dim3 grid((wd + HIST_INIT_WIDTH - 1)/ HIST_INIT_WIDTH, hd);
+ dim3 block(HIST_INIT_WIDTH, 1);
+- key->BindTexture(texDataF4);
+- InitHist_Kernel<<<grid, block>>>((int4*) hist->_cuData, ws, wd, hd);
++ CuTexImage::CuTexObj keyTex = key->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
++ InitHist_Kernel<<<grid, block>>>(keyTex.handle, (int4*) hist->_cuData, ws, wd, hd);
+ }
+
+
+
+-void __global__ ReduceHist_Kernel(int4* d_hist, int ws, int wd, int height)
++void __global__ ReduceHist_Kernel(cudaTextureObject_t texDataI4, int4* d_hist, int ws, int wd, int height)
+ {
+ int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
+ int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
+@@ -713,7 +723,7 @@ void __global__ ReduceHist_Kernel(int4* d_hist, int ws, int wd, int height)
+ #pragma unroll
+ for(int i = 0; i < 4 && scol < ws; ++i, ++scol)
+ {
+- int4 temp = tex1Dfetch(texDataI4, sidx + i);
++ int4 temp = tex1Dfetch<int4>(texDataI4, sidx + i);
+ v[i] = temp.x + temp.y + temp.z + temp.w;
+ }
+ d_hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);
+@@ -726,21 +736,21 @@ void ProgramCU::ReduceHistogram(CuTexImage*hist1, CuTexImage* hist2)
+ int wd = hist2->GetImgWidth(), hd = hist2->GetImgHeight();
+ int temp = (int)floorf(logf(float(wd * 2/ 3)) / logf(2.0f));
+ const int wi = min(7, max(temp , 0));
+- hist1->BindTexture(texDataI4);
++ CuTexImage::CuTexObj hist1Tex = hist1->BindTexture(texDataDesc, cudaCreateChannelDesc<int4>());
+
+ const int BW = 1 << wi, BH = 1 << (7 - wi);
+ dim3 grid((wd + BW - 1)/ BW, (hd + BH -1) / BH);
+ dim3 block(BW, BH);
+- ReduceHist_Kernel<<<grid, block>>>((int4*)hist2->_cuData, ws, wd, hd);
++ ReduceHist_Kernel<<<grid, block>>>(hist1Tex.handle, (int4*)hist2->_cuData, ws, wd, hd);
+ }
+
+
+-void __global__ ListGen_Kernel(int4* d_list, int list_len, int width)
++void __global__ ListGen_Kernel(cudaTextureObject_t texDataList, cudaTextureObject_t texDataI4, int4* d_list, int list_len, int width)
+ {
+ int idx1 = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
+- int4 pos = tex1Dfetch(texDataList, idx1);
++ int4 pos = tex1Dfetch<int4>(texDataList, idx1);
+ int idx2 = IMUL(pos.y, width) + pos.x;
+- int4 temp = tex1Dfetch(texDataI4, idx2);
++ int4 temp = tex1Dfetch<int4>(texDataI4, idx2);
+ int sum1 = temp.x + temp.y;
+ int sum2 = sum1 + temp.z;
+ pos.x <<= 2;
+@@ -766,15 +776,18 @@ void __global__ ListGen_Kernel(int4* d_list, int list_len, int width)
+ void ProgramCU::GenerateList(CuTexImage* list, CuTexImage* hist)
+ {
+ int len = list->GetImgWidth();
+- list->BindTexture(texDataList);
+- hist->BindTexture(texDataI4);
++ CuTexImage::CuTexObj listTex = list->BindTexture(texDataDesc, cudaCreateChannelDesc<int4>());
++ CuTexImage::CuTexObj histTex = hist->BindTexture(texDataDesc, cudaCreateChannelDesc<int4>());
+ dim3 grid((len + LISTGEN_BLOCK_DIM -1) /LISTGEN_BLOCK_DIM);
+ dim3 block(LISTGEN_BLOCK_DIM);
+- ListGen_Kernel<<<grid, block>>>((int4*) list->_cuData, len,
++ ListGen_Kernel<<<grid, block>>>(listTex.handle, histTex.handle, (int4*) list->_cuData, len,
+ hist->GetImgWidth());
+ }
+
+-void __global__ ComputeOrientation_Kernel(float4* d_list,
++void __global__ ComputeOrientation_Kernel(cudaTextureObject_t texDataF2,
++ cudaTextureObject_t texDataF4,
++ cudaTextureObject_t texDataList,
++ float4* d_list,
+ int list_len,
+ int width, int height,
+ float sigma, float sigma_step,
+@@ -791,16 +804,16 @@ void __global__ ComputeOrientation_Kernel(float4* d_list,
+ float4 key;
+ if(existing_keypoint)
+ {
+- key = tex1Dfetch(texDataF4, idx);
++ key = tex1Dfetch<float4>(texDataF4, idx);
+ }else
+ {
+- int4 ikey = tex1Dfetch(texDataList, idx);
++ int4 ikey = tex1Dfetch<int4>(texDataList, idx);
+ key.x = ikey.x + 0.5f;
+ key.y = ikey.y + 0.5f;
+ key.z = sigma;
+ if(subpixel || keepsign)
+ {
+- float4 offset = tex1Dfetch(texDataF4, IMUL(width, ikey.y) + ikey.x);
++ float4 offset = tex1Dfetch<float4>(texDataF4, IMUL(width, ikey.y) + ikey.x);
+ if(subpixel)
+ {
+ key.x += offset.y;
+@@ -835,7 +848,7 @@ void __global__ ComputeOrientation_Kernel(float4* d_list,
+ float dy = y - key.y;
+ float sq_dist = dx * dx + dy * dy;
+ if(sq_dist >= dist_threshold) continue;
+- float2 got = tex2D(texDataF2, x, y);
++ float2 got = tex2D<float2>(texDataF2, x, y);
+ float weight = got.x * exp(sq_dist * factor);
+ float fidx = floorf(got.y * ten_degree_per_radius);
+ int oidx = fidx;
+@@ -943,21 +956,31 @@ void ProgramCU::ComputeOrientation(CuTexImage* list, CuTexImage* got, CuTexImage
+ int len = list->GetImgWidth();
+ if(len <= 0) return;
+ int width = got->GetImgWidth(), height = got->GetImgHeight();
++ CuTexImage::CuTexObj texObjF4;
++ CuTexImage::CuTexObj texObjList;
+ if(existing_keypoint)
+ {
+- list->BindTexture(texDataF4);
++ texObjF4 = list->BindTexture(texDataDesc, cudaCreateChannelDesc<float4>());
+ }else
+ {
+- list->BindTexture(texDataList);
+- if(GlobalUtil::_SubpixelLocalization) key->BindTexture(texDataF4);
++ texObjList = list->BindTexture(texDataDesc, cudaCreateChannelDesc<int4>());
++ if(GlobalUtil::_SubpixelLocalization)
++ {
++ texObjF4 = key->BindTexture(texDataDesc, cudaCreateChannelDesc |